From 8b8defc149d5b0031169aaed210b629de621340a Mon Sep 17 00:00:00 2001 From: wgilmart Date: Fri, 9 Aug 2019 10:34:32 -0700 Subject: [PATCH] add tuned logic files for resnet and inception sizes --- .../archive/vega20_Cijk_Ailk_Bjlk_SB.yaml | 16562 +++- .../archive/vega20_Cijk_Ailk_Bljk_SB.yaml | 12964 ++- .../archive/vega20_Cijk_Alik_Bljk_SB.yaml | 18556 +++- .../asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml | 59908 +++++++++--- .../asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml | 76970 +++++++++++----- .../asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml | 71762 ++++++++++---- .../asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml | 59908 +++++++++--- .../asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml | 76970 +++++++++++----- .../asm_full/vega20_Cijk_Alik_Bljk_SB.yaml | 71762 ++++++++++---- scripts/performance/sgemm-resnet-inception.sh | 375 + scripts/performance/sgemm-tf-inception.sh | 375 + 11 files changed, 353196 insertions(+), 112916 deletions(-) create mode 100644 scripts/performance/sgemm-resnet-inception.sh create mode 100644 scripts/performance/sgemm-tf-inception.sh diff --git a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml index fba37ff22..4b91ed1df 100644 --- a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -47907,6 +47907,16298 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 300 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 301 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 302 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 303 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 304 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 305 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 306 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 307 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 308 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 309 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 310 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 311 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 312 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 313 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 314 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 315 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 316 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 317 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 318 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 319 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 320 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 321 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 322 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 323 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 324 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 325 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 326 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 327 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 328 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 329 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 330 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 331 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 332 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 333 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 334 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 335 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 336 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 337 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 338 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 339 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 340 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 341 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 342 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 343 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 344 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 345 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 346 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 347 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 348 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 349 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 350 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 351 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 352 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 353 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 354 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 355 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 356 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 357 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 358 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 359 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 360 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 361 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 362 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 363 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 364 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 365 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 366 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 367 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 368 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 369 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 370 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 371 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 372 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 373 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 374 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 375 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 376 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 377 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 378 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 379 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 380 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 381 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 382 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 383 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 384 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 385 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 386 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 387 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 388 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 389 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 390 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 391 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 392 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 393 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 394 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [4, 1028.02] @@ -49980,8 +66272,6 @@ - [100, 9061.26] - - [49, 2048, 128, 512] - [98, 6963.26] - - - [784, 512, 64, 128] - - [100, 8822.52] - - [784, 128, 128, 512] - [107, 8983.53] - - [196, 256, 64, 1024] @@ -49992,36 +66282,22 @@ - [99, 8581.25] - - [49, 2048, 256, 512] - [98, 7049.54] - - - [196, 1024, 64, 256] - - [101, 7953.59] - - [784, 128, 256, 512] - [109, 9102.89] - - [196, 256, 128, 1024] - [101, 8085.79] - - - [3136, 64, 64, 256] - - [105, 9266.03] - - - [784, 128, 64, 512] - - [106, 8809.29] - - - [49, 2048, 64, 512] - - [98, 6843.85] - - [3136, 64, 128, 256] - [105, 9381.29] - - [3136, 256, 128, 64] - [103, 8982.54] - - [784, 512, 128, 128] - [100, 8965.89] - - - [3136, 256, 64, 64] - - [103, 8879.7] - - [3136, 64, 256, 256] - [105, 9566.33] - - - [3136, 64, 64, 64] - - [104, 8313.95] - - [3136, 64, 256, 64] - [99, 8743.7] - - [196, 1024, 128, 256] - [102, 8119.33] - - - [49, 512, 64, 2048] - - [110, 7055.31] - - [49, 512, 256, 2048] - [111, 7166.31] - - [196, 1024, 256, 256] @@ -52894,4 +69170,258 @@ - [280, 4689.35] - - [64, 64, 36, 1760] - [230, 5622.24] + - - [196, 528, 32, 32] + - [313, 4088.41] + - - [5329, 64, 32, 80] + - [306, 8331.14] + - - [64, 2880, 1, 320] + - [357, 4362.6] + - - [49, 832, 32, 256] + - [320, 5618.63] + - - [3136, 64, 64, 64] + - [306, 8457.65] + - - [196, 512, 32, 24] + - [307, 3621.73] + - - [289, 1120, 1, 160] + - [303, 3302.86] + - - [1225, 192, 32, 32] + - [311, 6194.57] + - - [64, 2048, 32, 384] + - [334, 9541.54] + - - [1001, 1536, 1, 32] + - [305, 3575.67] + - - [289, 1792, 1, 320] + - [328, 5140.33] + - - [3136, 256, 64, 64] + - [329, 9310.12] + - - [1001, 1024, 1, 32] + - [300, 2733.4] + - - [196, 480, 32, 64] + - [361, 5070.42] + - - [64, 1728, 1, 320] + - [358, 3205.57] + - - [49, 832, 32, 160] + - [362, 4988.82] + - - [49, 2048, 64, 512] + - [332, 7370.31] + - - [49, 832, 32, 384] + - [320, 5901.95] + - - [289, 896, 1, 192] + - [346, 3452.59] + - - [289, 1024, 32, 384] + - [365, 8902.42] + - - [784, 192, 32, 96] + - [376, 7853.63] + - - [50176, 256, 1, 128] + - [339, 9041.83] + - - [289, 1024, 32, 256] + - [374, 8660.72] + - - [289, 1024, 32, 192] + - [363, 8433.35] + - - [12544, 512, 1, 256] + - [323, 9187.34] + - - [1225, 1728, 1, 192] + - [327, 7720.85] + - - [196, 480, 32, 96] + - [372, 5662.5] + - - [196, 512, 32, 144] + - [366, 6531.38] + - - [784, 400, 1, 32] + - [301, 1280.0] + - - [289, 768, 32, 128] + - [367, 7913.61] + - - [5329, 576, 1, 96] + - [310, 7563.46] + - - [49, 1200, 1, 128] + - [354, 1011.61] + - - [64, 1536, 32, 256] + - [368, 9159.54] + - - [289, 2592, 1, 384] + - [336, 6002.71] + - - [196, 528, 32, 128] + - [371, 5987.1] + - - [64, 2048, 32, 448] + - [334, 9669.87] + - - [196, 1024, 64, 256] + - [373, 7818.94] + - - [5329, 448, 1, 64] + - [306, 6201.02] + - - [784, 256, 32, 64] + - [308, 7623.18] + - - [784, 192, 32, 32] + - [313, 5874.26] + - - [21609, 288, 1, 32] + - [326, 5296.5] + - - [784, 256, 32, 32] + - [304, 6235.46] + - - [5041, 720, 1, 192] + - [322, 8140.98] + - - [289, 2016, 1, 256] + - [319, 5404.05] + - - [196, 512, 32, 128] + - [364, 6366.82] + - - [289, 768, 32, 160] + - [366, 8253.88] + - - [64, 1536, 32, 384] + - [337, 9508.5] + - - [64, 1280, 32, 320] + - [337, 9070.73] + - - [289, 896, 1, 128] + - [347, 2917.68] + - - [289, 3456, 1, 384] + - [327, 7274.91] + - - [196, 800, 1, 64] + - [349, 1393.78] + - - [64, 1280, 32, 384] + - [333, 9225.01] + - - [64, 1344, 1, 512] + - [352, 3041.45] + - - [1001, 4096, 1, 512] + - [333, 9391.77] + - - [1225, 192, 32, 64] + - [306, 7729.29] + - - [64, 1152, 1, 384] + - [356, 2440.65] + - - [729, 1600, 1, 192] + - [318, 6827.71] + - - [289, 1344, 1, 192] + - [316, 4439.04] + - - [784, 192, 32, 16] + - [343, 3663.04] + - - [3136, 1024, 1, 2048] + - [325, 9071.77] + - - [64, 1152, 1, 448] + - [353, 2564.45] + - - [49, 832, 32, 128] + - [316, 4733.16] + - - [784, 256, 32, 128] + - [329, 8471.6] + - - [49, 800, 1, 128] + - [351, 633.535] + - - [196, 512, 32, 32] + - [313, 4354.26] + - - [1225, 384, 32, 96] + - [330, 8751.63] + - - [5041, 576, 1, 96] + - [312, 7067.63] + - - [49, 832, 32, 48] + - [345, 3316.72] + - - [3136, 64, 64, 256] + - [367, 9721.9] + - - [5329, 160, 32, 64] + - [369, 8159.84] + - - [1225, 288, 32, 48] + - [359, 6673.65] + - - [4096, 9216, 1, 512] + - [341, 10116.9] + - - [196, 480, 32, 192] + - [370, 6388.46] + - - [64, 1152, 1, 256] + - [357, 1982.6] + - - [3136, 1024, 1, 512] + - [325, 8745.57] + - - [49, 832, 32, 32] + - [344, 2717.87] + - - [784, 192, 32, 64] + - [308, 7216.32] + - - [289, 1024, 32, 128] + - [331, 7970.5] + - - [289, 768, 32, 192] + - [375, 8327.27] + - - [289, 1120, 1, 192] + - [315, 3716.9] + - - [196, 512, 32, 112] + - [321, 6252.81] + - - [1001, 2048, 1, 32] + - [309, 4000.09] + - - [1225, 288, 32, 64] + - [369, 7208.04] + - - [196, 600, 1, 64] + - [348, 1093.95] + - - [1225, 384, 32, 192] + - [330, 9332.66] + - - [50176, 256, 1, 512] + - [340, 9833.54] + - - [196, 512, 32, 160] + - [367, 6614.34] + - - [4096, 4096, 1, 512] + - [338, 10032.2] + - - [49, 832, 32, 192] + - [316, 5244.53] + - - [1225, 256, 32, 64] + - [306, 7972.35] + - - [64, 2048, 32, 320] + - [334, 9404.27] + - - [196, 480, 32, 16] + - [360, 2724.49] + - - [1225, 256, 32, 48] + - [308, 7100.38] + - - [64, 1280, 32, 448] + - [333, 9344.41] + - - [1225, 1200, 1, 64] + - [302, 5157.89] + - - [1225, 384, 32, 64] + - [306, 8219.96] + - - [12544, 512, 1, 1024] + - [325, 9672.72] + - - [64, 1280, 32, 192] + - [321, 8525.01] + - - [196, 512, 32, 64] + - [306, 5489.34] + - - [289, 1792, 1, 256] + - [324, 4831.61] + - - [196, 528, 32, 256] + - [342, 6453.82] + - - [49, 512, 64, 2048] + - [377, 7548.98] + - - [64, 2048, 32, 192] + - [329, 8955.81] + - - [784, 512, 64, 128] + - [329, 9160.73] + - - [784, 128, 64, 512] + - [336, 9280.69] + - - [196, 528, 32, 160] + - [370, 6161.15] + - - [1225, 192, 32, 48] + - [306, 7236.92] + - - [64, 1728, 1, 192] + - [356, 2480.57] + - - [1001, 2048, 1, 64] + - [382, 5714.42] + - - [5329, 64, 128, 80] + - [389, 8835.29] + - - [64, 1280, 128, 448] + - [387, 10020.5] + - - [289, 768, 128, 128] + - [390, 8542.71] + - - [1225, 192, 128, 64] + - [379, 8444.77] + - - [1225, 288, 128, 48] + - [392, 7244.66] + - - [289, 768, 128, 192] + - [394, 8794.49] + - - [289, 768, 128, 160] + - [391, 8705.33] + - - [64, 2048, 128, 192] + - [385, 9780.26] + - - [64, 1280, 128, 384] + - [388, 9950.9] + - - [1225, 256, 128, 48] + - [380, 8273.61] + - - [1225, 192, 128, 48] + - [380, 8140.32] + - - [1225, 288, 128, 64] + - [392, 7886.21] + - - [64, 1280, 128, 320] + - [384, 9894.56] + - - [1225, 256, 128, 64] + - [385, 8572.51] + - - [1001, 2048, 1, 128] + - [386, 7289.06] + - - [1225, 192, 128, 32] + - [381, 7104.57] + - - [64, 1280, 128, 192] + - [393, 9642.08] + - - [1001, 1536, 1, 64] + - [383, 5146.56] - null diff --git a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml index 32375b607..1d7b68389 100644 --- a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -98602,6 +98602,12714 @@ _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 621 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 622 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 624 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 625 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 626 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 627 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 628 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 629 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 630 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 631 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 632 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3136 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 633 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 634 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 636 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 637 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 638 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 639 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 641 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 642 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 643 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 644 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 645 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 646 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 647 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 648 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 800 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1680 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1296 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1312 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1312 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 32 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [704, 1024, 1, 128] - [102, 3019.46] @@ -101003,36 +113711,18 @@ - [101, 4291.65] - - [3136, 64, 128, 64] - [183, 8175.06] - - - [784, 512, 64, 128] - - [181, 8378.34] - - - [3136, 256, 64, 64] - - [184, 8506.65] - - - [12544, 1024, 1, 256] - - [177, 8927.93] - - [784, 128, 128, 512] - [182, 8190.53] - - [784, 512, 256, 128] - [180, 8637.14] - - - [3136, 64, 64, 256] - - [179, 8782.93] - - - [3136, 512, 1, 2048] - - [176, 7298.32] - - - [12544, 256, 1, 1024] - - [188, 7667.25] - - - [3136, 2048, 1, 512] - - [187, 8447.22] - - [3136, 256, 256, 64] - [180, 8663.08] - - [3136, 64, 128, 256] - [178, 8943.46] - - - [784, 128, 64, 512] - - [186, 8006.27] - - [3136, 64, 256, 64] - [183, 8267.12] - - [784, 512, 128, 128] - [180, 8564.25] - - - [3136, 64, 64, 64] - - [183, 8009.35] - - [784, 128, 256, 512] - [184, 8377.06] - - [3136, 64, 256, 256] @@ -101117,8 +113807,6 @@ - [213, 4032.98] - - [1024, 256, 1, 4096] - [201, 7326.3] - - - [4096, 512, 1, 4096] - - [193, 9471.97] - - [1024, 200, 1, 2048] - [194, 5530.46] - - [2048, 1024, 1, 512] @@ -104835,4 +117523,240 @@ - [608, 9352.16] - - [256, 8976, 1, 44505] - [612, 8430.23] + - - [6272, 256, 1, 528] + - [664, 7389.94] + - - [3136, 2048, 1, 1024] + - [645, 9657.94] + - - [6272, 112, 1, 512] + - [643, 5931.09] + - - [2048, 320, 1, 1280] + - [663, 7772.99] + - - [289, 256, 1, 1568] + - [684, 3718.17] + - - [3136, 64, 64, 64] + - [623, 8201.15] + - - [50176, 128, 1, 256] + - [646, 8908.58] + - - [5329, 64, 1, 448] + - [629, 4602.2] + - - [289, 192, 1, 1344] + - [681, 3452.59] + - - [12544, 1024, 1, 256] + - [646, 9742.64] + - - [784, 64, 32, 192] + - [622, 6844.61] + - - [6272, 64, 1, 480] + - [630, 5562.24] + - - [196, 128, 1, 800] + - [672, 1639.74] + - - [64, 512, 1, 1344] + - [671, 2313.04] + - - [6272, 64, 1, 512] + - [629, 5609.19] + - - [6272, 160, 1, 528] + - [630, 6149.7] + - - [289, 160, 32, 768] + - [657, 6637.82] + - - [12544, 256, 1, 1024] + - [664, 8790.46] + - - [289, 224, 1, 1568] + - [684, 3270.17] + - - [5329, 64, 32, 160] + - [637, 9091.04] + - - [5329, 96, 1, 576] + - [664, 5555.66] + - - [3025, 64, 1, 363] + - [682, 4392.3] + - - [784, 32, 32, 192] + - [653, 5633.8] + - - [3136, 512, 1, 1024] + - [649, 7553.14] + - - [6272, 16, 1, 480] + - [684, 3219.85] + - - [1225, 64, 32, 288] + - [644, 8240.58] + - - [64, 256, 1, 1536] + - [677, 1456.36] + - - [289, 192, 32, 768] + - [656, 7372.8] + - - [2048, 448, 1, 1280] + - [639, 8403.01] + - - [3136, 2048, 1, 512] + - [638, 9486.31] + - - [289, 256, 1, 2016] + - [684, 3876.08] + - - [289, 384, 32, 1024] + - [623, 7350.54] + - - [1568, 32, 1, 832] + - [673, 2717.87] + - - [3136, 64, 32, 64] + - [626, 7657.26] + - - [289, 160, 1, 1120] + - [680, 2826.9] + - - [6272, 128, 1, 528] + - [634, 6926.26] + - - [21609, 32, 1, 288] + - [635, 3698.9] + - - [1225, 192, 1, 1728] + - [668, 7309.81] + - - [4096, 512, 1, 4096] + - [651, 10272.1] + - - [64, 256, 1, 1152] + - [677, 1387.82] + - - [6272, 96, 1, 480] + - [665, 6371.56] + - - [784, 96, 1, 800] + - [685, 3330.27] + - - [2048, 448, 1, 2048] + - [639, 8622.65] + - - [784, 96, 32, 192] + - [654, 7092.36] + - - [3136, 64, 64, 256] + - [647, 9579.16] + - - [289, 224, 1, 1344] + - [684, 3180.01] + - - [1001, 512, 1, 4096] + - [625, 8195.07] + - - [2048, 192, 1, 1280] + - [630, 6120.09] + - - [1225, 64, 32, 256] + - [635, 8076.62] + - - [2048, 256, 1, 1536] + - [625, 8137.7] + - - [1225, 64, 1, 1200] + - [684, 3552.87] + - - [6272, 128, 1, 512] + - [638, 6878.21] + - - [729, 192, 1, 1600] + - [683, 5016.77] + - - [289, 192, 1, 896] + - [681, 3091.87] + - - [1568, 384, 1, 832] + - [664, 6934.62] + - - [784, 16, 32, 192] + - [655, 3380.28] + - - [1568, 256, 1, 832] + - [629, 5980.86] + - - [1568, 48, 1, 832] + - [686, 3275.09] + - - [1568, 192, 1, 832] + - [624, 4441.11] + - - [289, 192, 32, 1024] + - [627, 6563.06] + - - [6272, 32, 1, 528] + - [668, 4998.67] + - - [49, 128, 1, 1200] + - [669, 550.175] + - - [1225, 64, 32, 384] + - [641, 8589.33] + - - [289, 128, 1, 896] + - [680, 2103.1] + - - [1568, 160, 1, 832] + - [668, 6995.05] + - - [1001, 32, 1, 1024] + - [677, 1744.72] + - - [2048, 320, 1, 2048] + - [662, 7118.04] + - - [2048, 384, 1, 1536] + - [625, 8184.01] + - - [50176, 512, 1, 256] + - [637, 9852.4] + - - [289, 256, 1, 1792] + - [686, 3809.75] + - - [64, 448, 1, 1152] + - [678, 2128.23] + - - [5041, 96, 1, 576] + - [663, 5279.3] + - - [6272, 192, 1, 480] + - [625, 7479.65] + - - [784, 32, 32, 256] + - [652, 5708.91] + - - [1001, 32, 1, 2048] + - [679, 2141.04] + - - [289, 192, 1, 1120] + - [675, 3277.77] + - - [6272, 32, 1, 512] + - [667, 4978.7] + - - [289, 384, 1, 3456] + - [684, 5904.14] + - - [289, 384, 1, 2592] + - [685, 5707.34] + - - [784, 128, 64, 512] + - [631, 8864.39] + - - [12544, 1024, 1, 512] + - [646, 10008.3] + - - [12544, 256, 1, 512] + - [664, 8628.08] + - - [6272, 24, 1, 512] + - [668, 3568.07] + - - [5041, 192, 1, 720] + - [639, 8424.42] + - - [64, 320, 1, 1728] + - [672, 1469.66] + - - [784, 128, 32, 256] + - [640, 8104.14] + - - [289, 96, 1, 864] + - [678, 1838.25] + - - [1225, 32, 32, 192] + - [659, 5949.72] + - - [1568, 128, 1, 832] + - [667, 5718.69] + - - [289, 128, 32, 768] + - [625, 7289.25] + - - [3136, 256, 64, 64] + - [633, 9103.92] + - - [196, 64, 1, 800] + - [671, 915.62] + - - [4096, 512, 1, 9216] + - [648, 10351.4] + - - [12544, 64, 1, 147] + - [638, 5069.33] + - - [784, 32, 1, 400] + - [669, 1140.36] + - - [6272, 160, 1, 512] + - [629, 6140.08] + - - [1225, 48, 32, 288] + - [635, 5978.61] + - - [64, 320, 1, 2880] + - [676, 1920.0] + - - [1225, 64, 32, 192] + - [629, 7641.01] + - - [1001, 32, 1, 1536] + - [677, 2084.79] + - - [784, 64, 32, 256] + - [621, 6990.51] + - - [64, 384, 1, 1152] + - [678, 1862.6] + - - [784, 512, 64, 128] + - [632, 9025.95] + - - [3136, 512, 1, 2048] + - [650, 7764.3] + - - [6272, 144, 1, 512] + - [625, 5574.04] + - - [1225, 192, 32, 384] + - [639, 9373.83] + - - [64, 192, 1, 1728] + - [677, 1206.46] + - - [8192, 320, 1, 1280] + - [691, 9875.92] + - - [8192, 320, 1, 2048] + - [694, 9745.7] + - - [8192, 384, 1, 1280] + - [691, 10046.2] + - - [8192, 192, 1, 1280] + - [694, 9950.9] + - - [8192, 192, 1, 2048] + - [690, 9559.67] + - - [8192, 384, 1, 2048] + - [692, 9945.74] + - - [8192, 448, 1, 2048] + - [693, 9908.51] + - - [1001, 64, 1, 1536] + - [687, 3649.94] + - - [8192, 448, 1, 1280] + - [691, 9981.35] + - - [1001, 64, 1, 2048] + - [688, 3580.87] + - - [1001, 128, 1, 2048] + - [689, 5587.87] - null diff --git a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml index 84d40bfca..09b032b57 100644 --- a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml @@ -65672,6 +65672,18240 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 413 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 414 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 415 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 416 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 417 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 418 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 419 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 420 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 421 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 422 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 423 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS0_FL1_GRVW2_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 424 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 425 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 426 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 427 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 428 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 429 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 430 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 431 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 432 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 433 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 434 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 435 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 436 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 437 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 438 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 439 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 440 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 441 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 442 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 443 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 444 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 445 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 446 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 447 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 448 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 449 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW1_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 450 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 451 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 452 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 453 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 454 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 455 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 456 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 457 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 458 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 459 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 460 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 461 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 462 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3200 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 128 + LdsOffsetB_Blk: 2176 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 463 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 464 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 465 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 466 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 467 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 468 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 469 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 470 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 471 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 472 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 473 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 474 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 475 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 476 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 477 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 478 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 479 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 480 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 481 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 482 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 483 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 484 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 485 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA4_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 486 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 487 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 488 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 489 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 490 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 491 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 492 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 493 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 494 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 495 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 496 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR0_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 497 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 498 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB0_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 499 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 500 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 501 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 502 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 503 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 504 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 505 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 506 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 507 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU4_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 508 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 509 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 510 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 511 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 512 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 513 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3200 + LdsOffsetA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 514 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 515 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 516 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 517 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 518 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [12, 896.219] @@ -67927,24 +86161,10 @@ - [95, 6513.35] - - [4288, 1024, 1, 128] - [80, 4291.67] - - - [512, 2048, 1, 49] - - [126, 4554.98] - - - [512, 128, 1, 784] - - [119, 3195.29] - - - [2048, 512, 1, 49] - - [127, 4253.33] - - - [1024, 256, 1, 196] - - [123, 4039.33] - - [256, 64, 1, 3136] - [121, 3015.27] - - [256, 1024, 1, 196] - [125, 4225.35] - - - [64, 256, 1, 3136] - - [122, 3058.35] - - - [128, 512, 1, 784] - - [120, 3380.28] - - - [64, 64, 1, 3136] - - [124, 1372.34] - - [1024, 1024, 1, 3328] - [237, 8705.0] - - [2048, 200, 1, 3200] @@ -68777,8 +86997,6 @@ - [231, 5745.62] - - [1024, 200, 1, 1280] - [223, 4446.13] - - - [4096, 512, 1, 4096] - - [141, 9264.39] - - [2048, 256, 1, 3200] - [231, 7842.75] - - [2048, 512, 1, 15360] @@ -69279,64 +87497,28 @@ - [237, 6628.17] - - [4096, 1024, 1, 6144] - [139, 9592.98] - - - [1280, 384, 1, 64] - - [270, 3196.88] - - [256, 64, 1, 1225] - [271, 1194.67] - - [2048, 320, 1, 64] - [273, 3449.26] - - - [256, 48, 1, 1225] - - [264, 913.398] - - - [2048, 192, 1, 64] - - [263, 2516.58] - - [1024, 128, 1, 289] - [277, 2869.68] - - - [1280, 192, 1, 64] - - [256, 1872.46] - - - [192, 32, 1, 1225] - - [261, 505.806] - - - [1280, 448, 1, 64] - - [257, 3078.87] - - [384, 64, 1, 1225] - [262, 1511.33] - - [2048, 384, 1, 64] - [275, 3836.25] - - - [288, 48, 1, 1225] - - [258, 1032.59] - - [64, 80, 1, 5329] - [274, 888.167] - - [1024, 384, 1, 289] - [268, 4291.52] - - [2048, 448, 1, 64] - [267, 3783.52] - - - [1280, 320, 1, 64] - - [273, 2776.95] - - - [192, 64, 1, 1225] - - [258, 926.897] - - - [384, 192, 1, 1225] - - [269, 2560.0] - - - [1536, 256, 1, 64] - - [276, 2621.44] - - - [192, 48, 1, 1225] - - [261, 698.614] - - - [768, 128, 1, 289] - - [278, 2291.12] - - - [1024, 256, 1, 289] - - [276, 4064.36] - - [768, 192, 1, 289] - [272, 2690.33] - - - [1536, 384, 1, 64] - - [259, 3145.73] - - [288, 64, 1, 1225] - [261, 1142.67] - - - [1024, 192, 1, 289] - - [266, 3243.13] - - [384, 96, 1, 1225] - [279, 1844.71] - - - [160, 64, 1, 5329] - - [265, 1564.48] - - - [768, 160, 1, 289] - - [260, 2386.58] - - [1024, 3392, 1, 4096] - [305, 8502.92] - - [1024, 3301, 1, 4096] @@ -71521,4 +89703,274 @@ - [372, 5309.25] - - [2816, 8976, 1, 256] - [383, 9409.56] + - - [1728, 320, 1, 64] + - [419, 3205.57] + - - [1152, 128, 1, 784] + - [466, 3498.96] + - - [576, 96, 1, 5329] + - [452, 3947.92] + - - [864, 96, 1, 1225] + - [473, 3009.67] + - - [256, 128, 1, 784] + - [463, 1536.49] + - - [1440, 320, 1, 196] + - [416, 4824.62] + - - [192, 48, 1, 1225] + - [494, 820.465] + - - [2592, 384, 1, 289] + - [434, 7353.01] + - - [192, 80, 36, 10368] + - [484, 5360.04] + - - [896, 192, 1, 289] + - [451, 3076.56] + - - [768, 128, 1, 289] + - [476, 2351.81] + - - [64, 256, 1, 3136] + - [502, 1809.16] + - - [1280, 384, 1, 64] + - [416, 3171.1] + - - [512, 144, 1, 196] + - [474, 1445.07] + - - [1344, 192, 1, 289] + - [457, 4376.52] + - - [288, 64, 1, 21609] + - [468, 3396.12] + - - [400, 32, 1, 784] + - [495, 922.353] + - - [288, 32, 1, 21609] + - [506, 2816.01] + - - [1280, 448, 1, 64] + - [419, 3253.56] + - - [3456, 256, 1, 169] + - [431, 5822.44] + - - [2304, 256, 1, 196] + - [429, 4931.98] + - - [384, 192, 1, 1225] + - [477, 2720.39] + - - [832, 48, 1, 49] + - [472, 344.518] + - - [832, 192, 1, 49] + - [454, 1099.36] + - - [1280, 192, 1, 64] + - [455, 2069.56] + - - [192, 32, 1, 784] + - [494, 459.627] + - - [288, 48, 1, 1225] + - [501, 1176.0] + - - [512, 112, 1, 196] + - [469, 1277.21] + - - [224, 192, 36, 2592] + - [486, 7369.56] + - - [528, 32, 1, 196] + - [460, 440.374] + - - [192, 128, 36, 1568] + - [485, 8245.76] + - - [4032, 384, 1, 64] + - [430, 5898.24] + - - [576, 64, 1, 3136] + - [475, 2671.11] + - - [2048, 32, 1, 1001] + - [477, 2323.0] + - - [480, 64, 1, 196] + - [462, 752.64] + - - [512, 256, 1, 196] + - [464, 2528.55] + - - [864, 96, 1, 289] + - [474, 1958.4] + - - [896, 128, 1, 289] + - [477, 2725.73] + - - [192, 64, 1, 784] + - [492, 898.675] + - - [1200, 64, 1, 1225] + - [476, 2780.14] + - - [1296, 288, 1, 196] + - [415, 3826.18] + - - [576, 96, 1, 5041] + - [456, 3795.58] + - - [1024, 256, 1, 289] + - [445, 4488.13] + - - [1024, 2048, 1, 49] + - [435, 5077.1] + - - [192, 64, 36, 6272] + - [479, 7514.98] + - - [4096, 512, 1, 4096] + - [441, 10276.0] + - - [192, 32, 1, 1225] + - [495, 556.686] + - - [1024, 256, 1, 196] + - [455, 3892.44] + - - [1120, 192, 1, 289] + - [444, 3752.81] + - - [400, 48, 1, 196] + - [469, 480.0] + - - [1728, 224, 1, 1225] + - [422, 5575.77] + - - [800, 96, 1, 784] + - [476, 2668.94] + - - [1152, 384, 1, 64] + - [426, 3077.34] + - - [4608, 512, 1, 49] + - [433, 4676.6] + - - [1792, 256, 1, 289] + - [426, 5345.94] + - - [864, 128, 1, 784] + - [476, 3816.2] + - - [1728, 384, 1, 169] + - [428, 5191.68] + - - [480, 16, 1, 196] + - [497, 241.231] + - - [1568, 256, 1, 289] + - [416, 4723.41] + - - [1152, 448, 1, 64] + - [422, 3356.72] + - - [512, 64, 1, 196] + - [461, 802.816] + - - [1344, 224, 1, 289] + - [416, 3519.63] + - - [9216, 512, 1, 4096] + - [439, 9146.02] + - - [27, 32, 1, 22201] + - [507, 264.356] + - - [1152, 192, 1, 784] + - [446, 4904.08] + - - [1536, 256, 1, 64] + - [414, 2578.47] + - - [800, 128, 1, 196] + - [476, 1991.11] + - - [800, 64, 1, 196] + - [471, 1150.83] + - - [864, 208, 1, 196] + - [448, 2684.72] + - - [1440, 320, 1, 49] + - [417, 2313.44] + - - [512, 128, 1, 784] + - [467, 2780.32] + - - [720, 192, 1, 5041] + - [442, 5410.46] + - - [256, 64, 1, 784] + - [499, 1163.5] + - - [256, 48, 1, 1225] + - [494, 1075.2] + - - [576, 192, 1, 3136] + - [442, 4833.01] + - - [160, 64, 1, 5329] + - [496, 1753.5] + - - [3456, 384, 1, 289] + - [436, 7341.75] + - - [32, 32, 36, 43808] + - [490, 1378.03] + - - [1344, 512, 1, 64] + - [415, 3822.93] + - - [192, 16, 1, 784] + - [495, 228.073] + - - [3456, 384, 1, 169] + - [432, 6675.02] + - - [1152, 256, 1, 196] + - [425, 3211.26] + - - [1728, 192, 1, 1225] + - [426, 4852.26] + - - [2048, 512, 1, 49] + - [438, 3471.64] + - - [576, 96, 1, 1225] + - [469, 2176.66] + - - [512, 2048, 1, 49] + - [420, 3845.83] + - - [1728, 192, 1, 64] + - [415, 2369.83] + - - [832, 256, 1, 49] + - [445, 1433.6] + - - [512, 128, 1, 196] + - [470, 1459.67] + - - [1200, 128, 1, 49] + - [465, 1069.09] + - - [528, 256, 1, 196] + - [453, 2069.76] + - - [256, 512, 1, 784] + - [476, 4538.89] + - - [480, 192, 1, 196] + - [476, 1792.0] + - - [96, 64, 36, 2592] + - [483, 4845.41] + - - [96, 96, 36, 2592] + - [488, 5111.53] + - - [1024, 192, 1, 289] + - [450, 3431.14] + - - [1536, 384, 1, 64] + - [421, 3166.84] + - - [192, 96, 1, 784] + - [461, 881.14] + - - [2048, 192, 1, 64] + - [418, 2330.17] + - - [192, 64, 1, 1225] + - [500, 1100.35] + - - [512, 32, 1, 196] + - [491, 477.867] + - - [128, 96, 36, 1568] + - [487, 6649.09] + - - [528, 128, 1, 196] + - [473, 1403.23] + - - [128, 512, 1, 784] + - [463, 2237.81] + - - [128, 128, 36, 3136] + - [480, 6538.77] + - - [528, 160, 1, 196] + - [477, 1642.67] + - - [448, 64, 1, 5329] + - [452, 3264.81] + - - [1280, 320, 1, 64] + - [416, 2776.95] + - - [1792, 320, 1, 289] + - [428, 5204.9] + - - [2880, 320, 1, 64] + - [424, 4336.94] + - - [147, 64, 1, 12544] + - [505, 2430.27] + - - [4096, 512, 1, 1001] + - [440, 9618.99] + - - [1536, 32, 1, 1001] + - [477, 1757.18] + - - [512, 160, 1, 196] + - [473, 1592.89] + - - [768, 160, 1, 289] + - [474, 2757.17] + - - [1728, 384, 1, 49] + - [426, 3102.49] + - - [64, 32, 36, 43808] + - [481, 2626.43] + - - [64, 64, 1, 3136] + - [493, 610.506] + - - [256, 32, 1, 784] + - [494, 612.837] + - - [480, 96, 1, 196] + - [469, 1055.1] + - - [1024, 32, 1, 1001] + - [459, 1188.43] + - - [832, 160, 1, 49] + - [474, 959.247] + - - [512, 1024, 1, 196] + - [417, 4978.7] + - - [96, 64, 36, 10368] + - [511, 5000.95] + - - [384, 448, 36, 512] + - [516, 8903.0] + - - [2048, 64, 1, 1001] + - [509, 4385.13] + - - [224, 192, 36, 5184] + - [515, 7487.81] + - - [2048, 128, 1, 1001] + - [508, 5764.63] + - - [96, 96, 36, 10368] + - [517, 5275.21] + - - [192, 80, 36, 20736] + - [513, 5409.4] + - - [96, 64, 36, 5184] + - [511, 4911.83] + - - [1536, 64, 1, 1001] + - [510, 3162.03] + - - [96, 64, 36, 20736] + - [512, 5034.33] + - - [384, 448, 36, 256] + - [514, 8815.87] + - - [96, 96, 36, 5184] + - [518, 5236.02] - null diff --git a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml index b440f65ce..520f17834 100644 --- a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -47907,6 +47907,29261 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 300 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 301 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 302 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 303 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 304 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 305 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 306 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 307 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 308 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 309 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 310 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 311 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 312 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 313 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 314 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 315 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 316 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 317 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 318 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 319 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 320 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 321 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 322 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 323 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 324 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 325 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 326 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 327 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 328 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 329 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 330 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 331 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 332 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 333 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 334 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 335 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 336 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 337 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 338 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 339 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 340 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 341 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 342 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 343 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 344 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 345 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 346 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 347 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 348 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 349 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 350 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 351 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 352 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 353 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 354 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 355 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 356 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 357 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 358 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 359 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 360 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 361 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 362 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 363 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 364 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 365 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 366 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 367 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 368 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 369 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 370 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 371 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 372 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 373 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 374 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 375 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 376 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 377 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 378 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 379 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 380 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 381 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 382 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 383 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 384 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 385 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 386 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 387 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 388 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 389 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 390 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 391 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 392 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 393 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 394 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 395 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 396 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 397 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 398 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 399 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 400 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 401 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 402 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 403 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 404 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 405 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 406 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 407 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 408 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 409 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 410 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 411 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 412 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 413 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 414 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 384 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 384 + LdsOffsetB_Blk: 1408 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 415 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 416 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 417 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 418 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 419 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 420 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 421 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 422 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 423 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 424 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 425 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 426 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 427 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 428 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 429 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 430 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 431 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 432 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 433 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 434 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 435 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 436 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 437 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 438 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 439 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 440 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 441 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 442 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 443 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 444 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 445 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 446 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 447 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 448 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 449 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 450 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 451 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 452 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 453 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 454 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 455 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 456 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 457 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 458 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 459 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 460 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 461 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 462 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 463 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 464 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 465 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 466 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 467 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 468 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 469 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 470 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 471 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 472 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 473 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 474 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 475 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 476 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 477 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 478 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 479 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 480 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 481 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -47914,7 +77169,752 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 482 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 483 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 484 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id031 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 485 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 486 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -47924,9 +77924,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -47935,46 +77935,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 8 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 2 + MacroTile0: 8 MacroTile1: 8 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -47983,13 +77983,162 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 487 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 64 PerformanceSyncLocation: -1 @@ -48035,35 +78184,333 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 300 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 + SolutionIndex: 488 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 489 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 SubGroup1: 4 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id001 + ThreadTile: *id028 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id007 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 490 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -48073,10 +78520,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -48084,47 +78531,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 LSPA: 8 LSPB: 8 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48132,8 +78579,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -48184,48 +78631,48 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 301 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 491 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id001 + ThreadTile: *id028 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id002 + WorkGroup: *id029 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -48233,26 +78680,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 16 LSCB: 16 - LSPA: 4 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 896 + LdsNumElements: 1024 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -48262,17 +78709,17 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -48281,15 +78728,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -48333,35 +78780,34 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 302 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SolutionIndex: 492 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id003 - ThreadTile0: 4 + ThreadTile: *id028 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id002 + WorkGroup: *id031 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -48371,41 +78817,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48419,10 +78864,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48430,15 +78875,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -48482,35 +78925,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 303 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionIndex: 493 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id002 + VectorWidth: 2 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -48520,9 +78963,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -48530,31 +78972,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48568,10 +79010,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48579,14 +79021,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -48631,47 +79071,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 304 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionIndex: 494 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -48679,31 +79118,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 8 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCB: 256 + LVPA: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48713,14 +79148,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48728,20 +79163,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -48780,79 +79213,74 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 305 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionIndex: 495 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: *id033 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48862,14 +79290,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48877,20 +79305,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -48929,79 +79355,74 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 306 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SolutionIndex: 496 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: *id033 WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 LdsOffsetB: 256 - LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49011,14 +79432,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 256 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49026,20 +79447,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -49078,79 +79497,74 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 307 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SolutionIndex: 497 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 32 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 64 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49163,11 +79577,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49175,20 +79589,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -49227,47 +79639,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 308 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 498 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -49275,31 +79686,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49312,11 +79723,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49324,15 +79735,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -49376,35 +79785,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 309 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionIndex: 499 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -49414,37 +79823,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCB: 64 + LSPA: 8 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 16 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -49463,9 +79871,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49473,13 +79881,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -49525,79 +79931,78 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 310 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SolutionIndex: 500 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49610,11 +80015,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49622,15 +80027,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -49674,35 +80077,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 311 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 501 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: *id033 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -49712,41 +80115,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49760,10 +80158,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49771,20 +80169,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -49822,36 +80218,36 @@ TransposeA: false TransposeB: true UseBeta: true - UseInitialStrides: false - SolutionIndex: 312 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + UseInitialStrides: false + SolutionIndex: 502 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: *id033 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -49861,41 +80257,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49909,10 +80304,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49920,15 +80315,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -49972,35 +80365,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 313 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionIndex: 503 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -50010,8 +80403,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -50028,23 +80420,19 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50058,10 +80446,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50069,20 +80457,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -50121,35 +80507,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 314 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionIndex: 504 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -50159,37 +80545,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCB: 64 + LSPA: 16 LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -50208,9 +80593,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50218,15 +80603,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -50270,35 +80653,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 315 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SolutionIndex: 505 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -50308,8 +80691,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -50326,23 +80708,19 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 + LSCB: 128 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50357,9 +80735,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50367,20 +80745,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -50419,79 +80795,76 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 316 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SolutionIndex: 506 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SubGroupB: 16 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id004 - WorkGroupMapping: 1 + WorkGroup: *id035 + WorkGroupMapping: 8 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 32 LSPA: 4 - LSPB: 16 - LVCA: 32 + LSPB: 8 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 1 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50504,11 +80877,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50516,20 +80889,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -50545,6 +80921,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -50554,6 +80931,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -50568,75 +80946,85 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 317 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 507 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 16 + LSPB: 4 LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 8 + LVCB: 16 + LVPA: 1 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -50653,11 +81041,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50665,15 +81053,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -50694,6 +81085,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -50703,6 +81095,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -50717,79 +81110,85 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 318 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 508 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id007 + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 1 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50802,11 +81201,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50814,20 +81213,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -50843,6 +81245,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -50852,6 +81255,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -50866,79 +81270,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 319 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 509 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 384 - LdsNumElementsAlignedB: 384 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 384 - LdsOffsetB_Blk: 1408 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50951,11 +81365,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50963,15 +81377,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -50992,6 +81409,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51001,6 +81419,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51015,96 +81434,102 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 320 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 510 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51112,20 +81537,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -51141,6 +81569,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51150,6 +81579,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51164,96 +81594,106 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 321 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 511 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51261,15 +81701,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -51290,6 +81733,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51299,6 +81743,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51313,96 +81758,102 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 322 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 512 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51410,20 +81861,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -51439,6 +81893,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51448,6 +81903,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51462,96 +81918,102 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 323 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 513 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51559,21 +82021,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -51588,6 +82053,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51597,6 +82063,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51611,33 +82078,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 324 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 514 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -51649,58 +82126,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51708,20 +82181,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -51737,6 +82213,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51746,6 +82223,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51760,33 +82238,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 325 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 515 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -51799,57 +82287,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51857,15 +82345,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -51886,6 +82377,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51895,6 +82387,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51909,33 +82402,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 326 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 516 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -51947,8 +82450,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -51958,7 +82461,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -51966,38 +82469,34 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 16 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 1 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -52006,20 +82505,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -52035,6 +82537,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52044,6 +82547,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52058,17 +82562,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 327 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 517 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -52076,15 +82587,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -52096,8 +82610,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -52107,7 +82621,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -52116,38 +82630,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCB: 64 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52157,19 +82667,22 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -52184,6 +82697,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52193,6 +82707,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52202,22 +82717,29 @@ Tensor1: 1 TileA: 0 TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - SolutionIndex: 328 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 518 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -52225,15 +82747,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -52245,58 +82770,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52304,20 +82825,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -52333,6 +82857,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52342,6 +82867,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52356,33 +82882,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 329 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 519 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -52395,57 +82931,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52453,15 +82989,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -52482,6 +83021,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52491,6 +83031,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52505,39 +83046,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 330 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 520 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -52545,7 +83096,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -52553,27 +83104,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -52583,7 +83134,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -52591,10 +83142,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52602,26 +83153,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -52631,6 +83188,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52640,6 +83198,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52654,47 +83213,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 331 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 521 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -52702,37 +83269,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -52741,9 +83304,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52751,26 +83314,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -52780,6 +83349,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52789,6 +83359,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52803,47 +83374,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 332 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 522 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -52851,48 +83430,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52900,26 +83475,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -52929,6 +83510,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52938,6 +83520,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52952,39 +83535,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 333 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 523 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -52992,45 +83583,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53038,10 +83629,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53049,26 +83640,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53078,6 +83675,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53087,6 +83685,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53101,85 +83700,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 334 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 524 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53187,10 +83790,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53198,26 +83801,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53227,6 +83836,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53236,6 +83846,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53250,85 +83861,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 335 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 525 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53336,9 +83951,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -53347,26 +83962,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53376,6 +83997,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53385,6 +84007,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53399,45 +84022,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 336 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 526 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -53447,8 +84078,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -53456,38 +84087,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -53496,26 +84123,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53525,6 +84158,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53534,6 +84168,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53548,47 +84183,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 337 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 527 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -53596,47 +84239,43 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -53645,26 +84284,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53674,6 +84319,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53683,6 +84329,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53697,39 +84344,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 338 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 528 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53737,7 +84392,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -53745,27 +84400,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -53775,7 +84430,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53783,10 +84438,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53794,26 +84449,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53823,6 +84484,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53832,6 +84494,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53846,39 +84509,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 339 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 529 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53894,8 +84565,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -53903,39 +84574,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53943,26 +84614,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53972,6 +84649,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53981,6 +84659,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53995,85 +84674,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 340 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 530 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54081,9 +84764,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -54092,26 +84775,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54121,6 +84810,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54130,6 +84820,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54144,45 +84835,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 341 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 531 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -54192,8 +84891,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -54201,39 +84900,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54241,26 +84936,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54270,6 +84971,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54279,6 +84981,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54293,45 +84996,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 342 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 532 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -54341,8 +85052,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -54350,38 +85061,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -54390,26 +85097,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54419,6 +85132,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54428,6 +85142,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54442,47 +85157,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 343 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 533 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -54490,48 +85213,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54539,26 +85258,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54568,6 +85293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54577,6 +85303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54591,85 +85318,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 344 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 534 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54677,9 +85408,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -54688,26 +85419,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54717,6 +85454,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54726,6 +85464,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54740,95 +85479,99 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 345 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 535 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -54837,26 +85580,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54866,6 +85615,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54875,6 +85625,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54889,33 +85640,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 346 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 536 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54927,58 +85686,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54986,26 +85741,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55015,6 +85776,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55024,6 +85786,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55038,85 +85801,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 347 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 537 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -55124,9 +85891,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -55135,26 +85902,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55164,6 +85937,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55173,6 +85947,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55187,85 +85962,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 348 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 538 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -55284,26 +86063,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55313,6 +86098,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55322,6 +86108,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55336,92 +86123,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 349 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 539 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -55433,26 +86224,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55462,6 +86259,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55471,6 +86269,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55485,85 +86284,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 350 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 540 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id012 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -55571,9 +86374,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -55585,23 +86388,29 @@ NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55611,6 +86420,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55620,6 +86430,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55634,39 +86445,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 351 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 541 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -55674,56 +86493,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55731,15 +86550,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -55751,6 +86575,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55760,6 +86585,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55769,6 +86595,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55783,39 +86610,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 352 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 542 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -55832,7 +86667,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -55840,12 +86675,12 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -55861,7 +86696,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -55869,10 +86704,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55880,15 +86715,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -55900,6 +86740,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55909,6 +86750,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55918,6 +86760,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55932,14 +86775,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 353 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 543 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -55950,78 +86800,79 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56029,15 +86880,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56049,6 +86903,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56058,6 +86913,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56067,6 +86923,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56075,45 +86932,55 @@ Tensor0: 0 Tensor1: 1 TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - SolutionIndex: 354 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 544 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56121,56 +86988,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 128 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56178,15 +87045,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56198,6 +87070,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56207,6 +87080,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56216,6 +87090,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56230,39 +87105,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 355 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 545 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56279,7 +87162,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -56287,39 +87170,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56327,15 +87210,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56347,6 +87235,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56356,6 +87245,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56365,6 +87255,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56379,17 +87270,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 356 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 546 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -56397,28 +87295,29 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -56428,7 +87327,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -56436,16 +87335,16 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -56457,18 +87356,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56476,15 +87375,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56496,6 +87398,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56505,6 +87408,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56514,6 +87418,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56528,14 +87433,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 357 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 547 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -56546,21 +87458,24 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56568,35 +87483,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -56606,18 +87521,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56625,15 +87540,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56645,6 +87565,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56654,6 +87575,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56663,6 +87585,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56677,96 +87600,104 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 358 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 548 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56774,15 +87705,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56794,6 +87728,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56803,6 +87738,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56812,6 +87748,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56826,39 +87763,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 359 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 549 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56866,35 +87813,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -56904,18 +87851,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56923,15 +87870,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56943,6 +87895,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56952,6 +87905,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56961,6 +87915,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56975,39 +87930,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 360 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 550 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57024,7 +87987,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -57032,39 +87995,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57072,15 +88035,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57092,6 +88060,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57101,6 +88070,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57110,6 +88080,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57124,46 +88095,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 361 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 551 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57173,7 +88152,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -57181,39 +88160,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57221,15 +88200,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57241,6 +88223,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57250,6 +88233,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57259,6 +88243,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57273,39 +88258,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 362 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57313,56 +88308,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57370,8 +88365,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -57379,6 +88374,11 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57390,6 +88390,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57399,6 +88400,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57408,6 +88410,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57422,96 +88425,104 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 363 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 553 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57519,15 +88530,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57539,6 +88553,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57548,6 +88563,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57557,6 +88573,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57571,46 +88588,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 364 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 554 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57627,7 +88654,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 LSCB: 128 LSPA: 8 @@ -57637,13 +88664,13 @@ LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57656,7 +88683,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -57670,13 +88697,18 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57688,6 +88720,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57697,6 +88730,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57706,6 +88740,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57720,35 +88755,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 365 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 555 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57758,8 +88801,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57776,23 +88819,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57806,10 +88849,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57817,15 +88860,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57837,6 +88883,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57846,6 +88893,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57855,6 +88903,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57869,35 +88918,45 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 366 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 556 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57907,8 +88966,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57925,23 +88984,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57956,9 +89016,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57966,26 +89026,31 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57995,6 +89060,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58004,6 +89070,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58018,79 +89085,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 367 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 557 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 - LSCB: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 4 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58103,11 +89181,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -58116,18 +89194,24 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58135,6 +89219,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58144,6 +89229,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58153,6 +89239,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58167,75 +89254,84 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 368 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 558 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -58252,10 +89348,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -58264,26 +89360,33 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58293,6 +89396,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58302,6 +89406,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58316,79 +89421,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 369 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 559 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 128 - LSPA: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58401,11 +89515,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -58414,18 +89528,22 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58433,6 +89551,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58442,6 +89561,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58451,6 +89571,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58465,79 +89586,86 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 370 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 560 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58550,10 +89678,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -58562,26 +89690,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58591,6 +89727,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58600,6 +89737,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58614,33 +89752,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 371 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58652,41 +89798,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58700,10 +89843,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -58711,26 +89854,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58740,6 +89891,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58749,6 +89901,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58763,33 +89916,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 372 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58803,39 +89964,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58849,10 +90011,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -58860,19 +90022,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58880,6 +90049,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58889,6 +90059,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58898,6 +90069,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58912,33 +90084,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 373 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 563 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58952,29 +90132,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -59011,17 +90192,24 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59029,6 +90217,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59038,6 +90227,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59047,6 +90237,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59061,33 +90252,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 374 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59117,23 +90316,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -59148,9 +90348,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59158,19 +90358,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59178,6 +90385,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59187,6 +90395,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59196,6 +90405,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59210,39 +90420,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 375 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59250,56 +90468,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 128 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 8 - LVPB: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59307,26 +90526,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59336,6 +90563,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59345,6 +90573,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59359,96 +90588,101 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 376 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 566 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59456,26 +90690,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59485,6 +90727,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59494,6 +90737,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59508,33 +90752,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 377 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59547,7 +90799,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -59560,44 +90812,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 + LSCA: 32 + LSCB: 8 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCA: 16 + LVCB: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59605,26 +90858,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59634,6 +90893,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59643,6 +90903,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59657,75 +90918,86 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 378 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 32 LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 6400 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -59735,18 +91007,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59754,19 +91026,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59774,6 +91051,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59783,6 +91061,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59792,6 +91071,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59806,79 +91086,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 379 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 96 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -59891,10 +91182,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -59903,19 +91194,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59923,6 +91219,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59932,6 +91229,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59941,6 +91239,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59955,79 +91254,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 380 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCA: 96 + LSCB: 64 + LSPA: 5 LSPB: 8 - LVCA: 16 + LVCA: 48 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60040,11 +91350,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60052,19 +91362,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60072,6 +91387,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60081,6 +91397,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60090,6 +91407,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60104,46 +91422,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 381 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -60160,23 +91488,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60189,11 +91518,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60201,19 +91530,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60221,6 +91555,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60230,6 +91565,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60239,6 +91575,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60253,39 +91590,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 382 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -60293,39 +91640,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60338,11 +91686,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60353,23 +91701,31 @@ NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60379,6 +91735,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60388,6 +91745,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60402,39 +91760,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 383 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -60458,23 +91824,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60487,11 +91854,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60499,19 +91866,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60519,6 +91893,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60528,6 +91903,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60537,6 +91913,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60551,79 +91928,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 384 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60636,11 +92022,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60648,19 +92034,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60668,6 +92059,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60677,6 +92069,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60686,6 +92079,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60700,79 +92094,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 385 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60785,11 +92190,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60797,19 +92202,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60817,6 +92229,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60826,6 +92239,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60835,6 +92249,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60849,48 +92264,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 386 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id027 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -60902,26 +92325,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60934,11 +92358,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60954,11 +92378,18 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60966,6 +92397,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60975,6 +92407,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60984,6 +92417,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60998,33 +92432,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 387 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id027 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61038,7 +92480,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -61050,44 +92492,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 64 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61095,19 +92538,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61115,6 +92565,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61124,6 +92575,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61133,6 +92585,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61147,39 +92600,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 388 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61195,7 +92656,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -61203,23 +92664,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3328 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -61232,11 +92694,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61247,16 +92709,23 @@ NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61264,6 +92733,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61273,6 +92743,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61282,6 +92753,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61296,14 +92768,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 389 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 @@ -61314,21 +92793,22 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id031 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61344,31 +92824,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -61381,11 +92862,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61393,19 +92874,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61413,6 +92901,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61422,6 +92911,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61431,6 +92921,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61445,39 +92936,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 390 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 4 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61493,48 +92992,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 + LSCB: 32 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61542,19 +93042,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61562,6 +93069,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61571,6 +93079,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61580,6 +93089,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61594,17 +93104,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 391 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 @@ -61612,21 +93129,22 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id030 - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61642,31 +93160,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -61679,10 +93198,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 8 - MacroTileA: 8 + MacroTileA: 32 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -61693,17 +93212,24 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61711,6 +93237,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61720,6 +93247,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61729,6 +93257,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61743,14 +93272,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 392 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 @@ -61760,22 +93296,23 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61783,7 +93320,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -61791,37 +93328,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61829,9 +93367,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -61841,25 +93379,33 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61869,6 +93415,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61878,6 +93425,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61892,33 +93440,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 393 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id030 + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61940,48 +93496,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61989,19 +93546,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62009,6 +93573,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62018,6 +93583,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62027,6 +93593,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -62041,33 +93608,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 394 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62081,8 +93656,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -62090,47 +93665,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62138,19 +93714,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62158,6 +93741,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62167,6 +93751,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62176,6 +93761,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -62190,33 +93776,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 395 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62229,9 +93823,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -62239,47 +93833,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62287,26 +93882,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62316,6 +93917,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62325,6 +93927,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -62339,33 +93942,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 396 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62379,65 +93992,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -62445,17 +94061,25 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62465,6 +94089,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62474,8 +94099,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -62488,77 +94115,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 397 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id031 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -62571,29 +94209,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62601,6 +94250,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62610,6 +94260,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62619,8 +94270,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -62633,78 +94286,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 398 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -62717,7 +94380,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -62725,21 +94388,30 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62747,6 +94419,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62756,6 +94429,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62765,8 +94439,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -62779,74 +94455,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 399 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -62856,39 +94548,49 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62898,6 +94600,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62907,8 +94610,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -62921,74 +94626,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 400 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 + LSCA: 64 + LSCB: 96 LSPA: 8 - LSPB: 1 + LSPB: 5 LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -62998,39 +94719,49 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63040,6 +94771,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63049,8 +94781,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63063,74 +94797,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 401 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 + LSCA: 64 + LSCB: 96 LSPA: 8 - LSPB: 1 + LSPB: 5 LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63140,39 +94890,49 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63182,6 +94942,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63191,8 +94952,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63205,74 +94968,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 402 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63285,36 +95064,46 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63324,6 +95113,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63333,8 +95123,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63347,78 +95139,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 403 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63431,29 +95235,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63461,6 +95276,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63470,6 +95286,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63479,8 +95296,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63493,33 +95312,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 404 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -63531,40 +95358,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63579,27 +95408,36 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63607,6 +95445,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63616,6 +95455,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63625,8 +95465,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63639,13 +95481,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 405 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -63655,17 +95505,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -63677,40 +95529,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 - LVPA: 8 + LVCB: 64 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63724,28 +95578,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63753,6 +95616,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63762,6 +95626,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63771,8 +95636,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63785,74 +95652,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 406 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63865,36 +95748,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63904,6 +95799,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63913,8 +95809,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63927,12 +95825,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 407 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -63943,58 +95849,60 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -64011,29 +95919,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64041,6 +95960,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64050,6 +95970,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -64059,8 +95980,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64073,74 +95996,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 408 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -64153,36 +96090,46 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64192,6 +96139,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -64201,8 +96149,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64215,70 +96165,82 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 409 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -64299,29 +96261,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64329,6 +96302,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64338,6 +96312,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -64347,8 +96322,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64361,74 +96338,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 410 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -64441,36 +96432,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64480,6 +96483,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -64489,8 +96493,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64503,32 +96509,39 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 411 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id035 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -64536,14 +96549,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64560,50 +96573,60 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -64613,13 +96636,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64642,6 +96667,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64658,20 +96684,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 412 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -64679,16 +96705,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -64696,14 +96720,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64720,54 +96744,60 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -64777,6 +96807,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64784,6 +96815,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64806,6 +96838,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64822,37 +96855,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 413 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + SolutionIndex: 603 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -64860,13 +96891,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -64884,50 +96915,58 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -64937,13 +96976,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64966,6 +97007,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64982,20 +97024,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 414 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 604 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -65003,16 +97045,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65020,7 +97062,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -65044,54 +97086,58 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65101,6 +97147,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -65108,6 +97155,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65130,6 +97178,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -65146,37 +97195,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 415 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + SolutionIndex: 605 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65184,14 +97233,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65208,25 +97257,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 1 - LVPB: 4 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -65234,24 +97288,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65261,13 +97320,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65290,6 +97351,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -65306,8 +97368,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 416 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -65316,10 +97378,10 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -65327,16 +97389,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65344,7 +97404,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -65368,29 +97428,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -65399,23 +97460,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65425,6 +97489,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -65432,6 +97497,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65454,6 +97520,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -65470,16 +97537,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 417 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -65491,10 +97558,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -65514,53 +97581,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -65569,13 +97641,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65585,13 +97660,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65630,8 +97707,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 418 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 608 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -65640,27 +97717,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65668,60 +97743,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65729,13 +97809,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65745,13 +97828,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65790,37 +97875,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 419 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 609 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65834,37 +97917,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -65878,10 +97966,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65889,13 +97977,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65905,13 +97996,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65950,8 +98043,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 420 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 610 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -65960,23 +98053,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -65988,47 +98079,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66041,11 +98133,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66053,13 +98145,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66069,6 +98164,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66076,6 +98172,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66114,8 +98211,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 421 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 611 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -66123,28 +98220,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66158,53 +98253,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 4 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -66213,13 +98313,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66229,13 +98332,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66274,37 +98379,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 422 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 612 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66312,60 +98415,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66373,13 +98481,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66389,13 +98498,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66434,15 +98545,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 423 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 613 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -66454,17 +98565,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66478,53 +98589,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -66533,13 +98649,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66549,13 +98666,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66594,14 +98713,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 424 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 614 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -66614,9 +98733,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -66624,7 +98743,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66639,36 +98758,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -66678,18 +98798,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66697,13 +98817,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66713,6 +98836,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66720,6 +98844,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66758,37 +98883,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 425 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 615 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66796,7 +98919,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -66804,39 +98927,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66849,7 +98973,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -66862,14 +98986,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66879,8 +99004,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -66925,8 +99051,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 426 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 616 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -66934,22 +99060,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -66961,13 +99087,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -66985,6 +99111,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 @@ -66995,9 +99122,13 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67010,7 +99141,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -67024,13 +99155,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67040,7 +99172,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -67086,8 +99219,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 427 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -67110,11 +99243,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67122,13 +99255,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -67142,10 +99275,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 @@ -67156,15 +99290,19 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67184,14 +99322,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67201,7 +99340,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -67247,35 +99387,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 428 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 618 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67283,41 +99423,42 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -67329,7 +99470,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67337,10 +99478,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67348,15 +99489,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67366,8 +99506,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -67412,35 +99553,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 429 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + SolutionIndex: 619 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67448,49 +99591,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67498,10 +99646,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67509,15 +99657,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67527,8 +99676,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -67573,35 +99723,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 430 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 620 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67609,43 +99759,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67658,11 +99813,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67671,14 +99826,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67688,7 +99842,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -67734,15 +99889,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 431 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 621 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -67750,19 +99905,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67770,13 +99927,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -67790,29 +99947,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67820,10 +99982,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67831,15 +99993,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67849,8 +100012,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -67895,35 +100059,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 432 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 622 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67931,13 +100095,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -67951,23 +100115,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 4 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67980,11 +100149,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67992,15 +100161,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68010,7 +100180,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -68056,35 +100227,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 433 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68099,30 +100270,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -68158,14 +100330,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68175,8 +100346,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -68221,8 +100393,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 434 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 624 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -68230,26 +100402,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68257,7 +100431,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68265,45 +100439,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -68311,10 +100486,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68322,15 +100497,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68340,8 +100516,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -68386,35 +100563,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 435 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + SolutionIndex: 625 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68428,37 +100605,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68473,9 +100655,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68484,14 +100666,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68501,8 +100684,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -68547,15 +100731,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 436 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 626 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -68563,19 +100747,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68583,43 +100767,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68632,11 +100821,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68645,14 +100834,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68662,7 +100850,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -68708,15 +100897,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 437 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 627 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -68724,19 +100913,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68744,13 +100935,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -68764,23 +100955,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 4 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68793,11 +100989,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68805,15 +101001,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68823,7 +101020,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -68869,35 +101067,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 438 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 628 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68905,49 +101103,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -68956,9 +101159,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68966,15 +101169,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68984,8 +101188,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -69030,15 +101235,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 439 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 629 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -69046,19 +101251,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69072,54 +101277,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69128,14 +101338,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69145,8 +101356,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -69191,8 +101403,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 440 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 630 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -69201,25 +101413,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69233,37 +101445,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 LVCA: 64 - LVCB: 16 + LVCB: 64 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69277,10 +101494,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69288,15 +101505,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69306,7 +101524,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -69352,35 +101571,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 441 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 631 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69388,13 +101607,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -69404,43 +101623,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -69449,15 +101673,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69467,7 +101692,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -69513,14 +101739,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 442 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 632 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -69533,15 +101759,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69549,13 +101775,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -69573,25 +101799,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 2 - LSPB: 4 + LSPA: 4 + LSPB: 8 LVCA: 64 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -69610,15 +101841,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69628,8 +101860,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -69674,8 +101907,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 443 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 633 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -69695,14 +101928,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69716,37 +101949,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69761,9 +101999,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69772,14 +102010,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69789,7 +102028,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -69835,15 +102075,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 444 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 634 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -69851,19 +102091,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69871,60 +102111,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69932,15 +102177,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69950,8 +102194,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -69996,15 +102241,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 445 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -70012,19 +102257,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70038,7 +102285,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -70056,19 +102303,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 + LSCB: 64 + LSPA: 4 LSPB: 4 LVCA: 64 - LVCB: 32 - LVPA: 2 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70083,9 +102335,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70095,13 +102347,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70111,7 +102364,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -70157,15 +102411,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 446 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 636 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -70178,14 +102432,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70193,14 +102447,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -70217,40 +102471,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70258,15 +102513,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70276,6 +102530,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70322,20 +102577,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 447 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 637 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -70343,14 +102598,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70358,7 +102615,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70366,56 +102623,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70423,15 +102681,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70441,6 +102700,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70487,31 +102747,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 448 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 638 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -70523,43 +102783,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCA: 32 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -70569,18 +102830,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70588,13 +102849,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70604,6 +102868,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70650,37 +102915,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 449 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 639 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70688,7 +102951,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70696,56 +102959,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70753,8 +103017,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -70762,6 +103026,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70771,6 +103036,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70817,35 +103083,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 450 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70861,56 +103127,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70918,15 +103185,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70936,6 +103204,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70982,35 +103251,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 451 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 641 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71025,36 +103294,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -71073,9 +103343,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71083,13 +103353,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71099,6 +103372,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71145,8 +103419,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 452 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 642 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -71154,28 +103428,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71183,7 +103455,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -71191,39 +103463,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -71236,11 +103509,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71248,8 +103521,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -71257,6 +103530,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71266,6 +103540,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71312,8 +103587,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 453 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 643 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -71321,14 +103596,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -71336,11 +103611,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71348,7 +103623,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -71356,56 +103631,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71413,13 +103689,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71429,6 +103706,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71475,37 +103753,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 454 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 644 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71513,64 +103791,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71578,15 +103857,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71596,6 +103874,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71642,35 +103921,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 455 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 645 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71686,56 +103967,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71743,15 +104025,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71761,6 +104044,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71807,35 +104091,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 456 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 646 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71850,57 +104134,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 + LSCA: 32 + LSCB: 64 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71908,13 +104193,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71924,6 +104212,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71970,37 +104259,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 457 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 647 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -72016,39 +104303,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72062,9 +104350,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72073,15 +104361,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72091,6 +104380,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -72137,8 +104427,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 458 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 648 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72146,18 +104436,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -72165,7 +104455,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -72173,47 +104463,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72226,11 +104517,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72238,13 +104529,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72254,6 +104548,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -72300,8 +104595,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 459 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72309,28 +104604,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -72338,7 +104631,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -72346,56 +104639,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72403,15 +104697,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72421,6 +104716,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -72467,35 +104763,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 460 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -72503,64 +104799,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72568,13 +104865,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72584,6 +104884,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -72630,33 +104931,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 461 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -72668,40 +104967,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -72722,11 +105021,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72734,13 +105033,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72752,7 +105054,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -72797,8 +105099,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 462 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72807,23 +105109,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -72843,40 +105143,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72890,9 +105190,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72901,15 +105201,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72966,8 +105267,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 463 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72976,17 +105277,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -72994,7 +105295,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -73022,28 +105323,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73057,10 +105358,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73068,15 +105369,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -73088,7 +105390,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -73133,8 +105435,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 464 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73143,19 +105445,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -73169,44 +105471,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -73216,7 +105518,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73224,10 +105526,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73235,13 +105537,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -73298,37 +105603,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 465 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -73336,13 +105639,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -73356,30 +105659,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73387,10 +105694,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73398,14 +105705,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -73418,7 +105725,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -73464,31 +105771,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 466 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -73506,7 +105813,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -73520,24 +105827,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 4 - LSPB: 4 + LSPB: 8 LVCA: 64 - LVCB: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73552,9 +105863,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73562,14 +105873,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -73582,7 +105893,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -73628,8 +105939,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 467 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73638,17 +105949,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -73664,7 +105975,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73684,45 +105995,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 16 LSPA: 4 - LSPB: 4 + LSPB: 16 LVCA: 64 - LVCB: 64 + LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73730,13 +106041,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -73796,31 +106107,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 468 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -73832,48 +106143,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73886,7 +106197,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -73899,13 +106210,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -73964,8 +106273,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 469 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73980,15 +106289,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -74007,37 +106318,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -74047,18 +106358,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74066,13 +106377,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -74132,15 +106441,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 470 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -74152,15 +106461,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -74195,17 +106506,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 16 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -74224,9 +106535,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74234,14 +106545,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -74255,7 +106566,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -74300,28 +106611,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 471 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -74336,13 +106647,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -74356,30 +106667,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -74387,10 +106702,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74398,14 +106713,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -74418,7 +106733,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -74464,35 +106779,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 472 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -74500,48 +106815,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 8 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74554,11 +106869,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74566,12 +106881,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -74585,7 +106902,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -74630,33 +106947,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 473 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -74668,54 +106983,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 32 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 4 + LVCB: 32 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -74724,9 +107039,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74734,11 +107049,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -74798,15 +107115,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 474 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -74819,12 +107136,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -74843,41 +107158,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 64 LSCB: 64 - LSPA: 5 - LSPB: 8 - LVCA: 48 - LVCB: 32 - LVPA: 3 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74891,9 +107206,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -74902,12 +107217,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -74966,8 +107283,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 475 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74976,13 +107293,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -74991,8 +107308,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75020,7 +107335,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -75030,22 +107345,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 64 LSCB: 64 - LSPA: 5 + LSPA: 8 LSPB: 8 - LVCA: 48 + LVCA: 32 LVCB: 32 - LVPA: 3 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -75059,9 +107374,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75070,11 +107385,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -75134,8 +107449,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 476 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75144,10 +107459,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -75159,7 +107474,7 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75179,7 +107494,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -75188,7 +107503,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -75199,21 +107514,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 64 LSPA: 8 - LSPB: 5 + LSPB: 8 LVCA: 32 - LVCB: 48 + LVCB: 32 LVPA: 4 - LVPB: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -75228,9 +107543,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75238,12 +107553,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -75302,8 +107619,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 477 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75312,11 +107629,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -75327,8 +107644,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75340,7 +107655,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75348,57 +107663,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 6656 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75406,13 +107721,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -75427,7 +107742,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -75472,31 +107787,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 478 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75508,7 +107823,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75534,14 +107849,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 512 @@ -75555,7 +107870,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -75563,10 +107878,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75574,8 +107889,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -75640,31 +107955,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 479 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75683,37 +107998,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -75732,9 +108047,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75742,11 +108057,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -75806,8 +108123,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 480 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75816,23 +108133,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75844,7 +108159,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75860,7 +108175,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -75870,28 +108185,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -75899,9 +108214,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75910,14 +108225,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -75976,14 +108291,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 481 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -75997,10 +108312,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76020,36 +108335,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 16 + LSPA: 4 LSPB: 16 - LVCA: 16 + LVCA: 64 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -76059,18 +108374,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76078,13 +108393,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -76144,29 +108459,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 482 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -76180,48 +108495,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -76234,7 +108549,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -76247,12 +108562,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -76312,8 +108625,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 483 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76332,11 +108645,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76356,56 +108671,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76414,14 +108729,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -76480,35 +108795,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 484 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -76523,41 +108838,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -76571,9 +108886,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76582,14 +108897,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -76648,31 +108961,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 485 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76684,48 +108999,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -76738,11 +109053,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76752,12 +109067,10 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -76816,15 +109129,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 486 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -76832,19 +109145,21 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -76860,57 +109175,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76918,13 +109233,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -76984,28 +109299,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 487 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -77020,7 +109335,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -77028,36 +109343,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77074,11 +109389,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77087,13 +109402,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -77107,7 +109422,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -77152,15 +109467,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 488 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -77172,11 +109487,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -77188,7 +109503,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -77214,18 +109529,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77235,7 +109550,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -77243,10 +109558,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77254,14 +109569,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -77320,20 +109635,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 489 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -77341,14 +109656,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -77364,40 +109679,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 - LVPA: 2 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77411,9 +109726,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77422,14 +109737,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -77488,8 +109803,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 490 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77498,19 +109813,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -77524,14 +109839,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -77550,18 +109865,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77578,10 +109893,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77590,12 +109905,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -77609,7 +109926,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -77654,8 +109971,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 491 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77664,10 +109981,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -77676,11 +109993,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -77700,36 +110015,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77739,34 +110054,32 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -77781,7 +110094,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -77810,7 +110123,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -77827,28 +110139,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 492 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -77871,36 +110183,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77910,34 +110222,32 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -77981,7 +110291,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -77998,15 +110307,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 493 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -78018,9 +110327,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -78034,48 +110343,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78088,7 +110397,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -78096,18 +110405,18 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78150,7 +110459,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78167,8 +110475,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 494 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78183,17 +110491,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78205,40 +110511,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -78252,33 +110558,33 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78321,7 +110627,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78338,15 +110643,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 495 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -78358,13 +110663,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78376,7 +110679,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78403,27 +110706,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 32 LSPA: 8 - LSPB: 5 + LSPB: 16 LVCA: 32 - LVCB: 48 + LVCB: 16 LVPA: 4 - LVPB: 3 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -78432,24 +110735,22 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78463,7 +110764,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -78492,7 +110793,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78509,33 +110809,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 496 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78547,48 +110847,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 - LSPA: 8 - LSPB: 5 - LVCA: 32 - LVCB: 48 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78601,26 +110901,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78663,7 +110963,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78680,8 +110979,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 497 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78690,23 +110989,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78718,7 +111015,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78726,46 +111023,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 96 - LSPA: 8 - LSPB: 5 - LVCA: 32 - LVCB: 48 - LVPA: 4 - LVPB: 3 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -78773,25 +111070,23 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78834,7 +111129,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78851,33 +111145,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 498 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78889,14 +111183,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -78915,56 +111209,52 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79007,7 +111297,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79024,31 +111313,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 499 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79060,76 +111351,76 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -79176,7 +111467,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79193,33 +111483,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 500 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79231,14 +111519,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -79257,28 +111545,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -79286,25 +111574,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79347,7 +111635,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79364,15 +111651,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 501 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 @@ -79385,12 +111672,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79402,7 +111687,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79428,28 +111713,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -79457,27 +111742,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79520,7 +111803,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79537,15 +111819,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 502 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -79558,10 +111840,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79573,7 +111855,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79599,56 +111881,54 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79691,7 +111971,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79708,14 +111987,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 503 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -79729,10 +112008,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79744,14 +112023,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -79770,28 +112049,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -79799,25 +112078,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79860,7 +112139,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79877,15 +112155,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 504 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -79898,12 +112176,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79941,22 +112217,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -79970,10 +112246,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79983,14 +112259,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -80004,7 +112280,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -80040,6 +112316,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80050,8 +112327,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 505 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80060,11 +112337,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -80072,7 +112349,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -80092,7 +112369,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -80106,28 +112383,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -80141,10 +112414,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80154,14 +112427,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -80174,8 +112447,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -80211,6 +112484,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80221,8 +112495,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 506 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80231,19 +112505,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -80257,7 +112531,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80265,56 +112539,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80325,13 +112599,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -80382,6 +112656,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80392,14 +112667,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 507 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -80408,15 +112683,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -80428,7 +112703,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80436,56 +112711,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80496,13 +112771,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -80553,6 +112828,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80563,14 +112839,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 508 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -80579,15 +112855,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -80599,54 +112875,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -80654,9 +112926,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80667,12 +112939,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -80685,7 +112959,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -80722,6 +112996,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80732,14 +113007,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 509 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 699 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -80748,17 +113023,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -80770,54 +113043,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -80825,9 +113094,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80838,12 +113107,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -80856,7 +113127,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -80893,6 +113164,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80903,14 +113175,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 510 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -80919,17 +113191,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -80941,7 +113211,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80949,57 +113219,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81009,14 +113279,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81030,7 +113300,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -81066,6 +113336,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81076,15 +113347,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 511 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -81092,15 +113363,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81112,65 +113383,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81180,12 +113447,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81198,8 +113467,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -81235,6 +113504,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81245,15 +113515,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 512 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -81261,21 +113531,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81283,7 +113551,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81303,34 +113571,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -81339,23 +113607,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -81370,7 +113640,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -81399,12 +113669,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81415,15 +113687,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 513 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -81435,11 +113707,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81451,54 +113723,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -81506,25 +113774,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81537,7 +113807,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81567,12 +113837,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81583,31 +113855,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 514 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81619,7 +113891,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81645,22 +113917,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -81673,26 +113945,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81735,12 +114009,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81751,8 +114027,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 515 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81761,11 +114037,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -81773,13 +114049,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81787,13 +114063,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -81807,28 +114083,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -81841,26 +114113,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81873,7 +114147,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81903,12 +114177,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81919,8 +114195,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 516 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81929,21 +114205,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81975,60 +114251,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82071,12 +114349,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82087,28 +114367,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 517 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -82123,78 +114403,78 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82207,7 +114487,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -82237,12 +114517,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82253,15 +114535,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 518 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 708 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -82269,17 +114551,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -82291,14 +114571,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -82317,28 +114597,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 8 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -82346,23 +114626,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82405,12 +114689,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82421,14 +114707,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 519 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -82442,12 +114728,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -82459,7 +114743,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82467,46 +114751,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 + LSCB: 64 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -82515,24 +114799,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82575,12 +114861,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82591,15 +114879,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 520 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -82607,15 +114895,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -82627,7 +114915,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82635,8 +114923,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82647,7 +114935,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -82655,20 +114943,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82681,7 +114969,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -82689,12 +114977,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -82743,12 +115033,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82759,8 +115051,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 521 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -82779,15 +115071,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -82795,7 +115087,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82803,40 +115095,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82849,7 +115141,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -82857,18 +115149,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82911,12 +115205,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82927,8 +115223,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 522 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -82943,15 +115239,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -82971,36 +115267,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -83010,33 +115306,35 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -83079,12 +115377,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83095,15 +115395,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 523 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -83111,13 +115411,13 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -83138,71 +115438,75 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -83245,12 +115549,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83261,14 +115567,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 524 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -83281,13 +115587,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -83299,14 +115603,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -83325,28 +115629,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -83354,24 +115658,24 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -83415,12 +115719,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83431,31 +115737,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 525 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -83529,7 +115837,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -83581,12 +115891,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83597,8 +115909,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 526 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -83635,7 +115947,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83662,27 +115974,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 + LVCB: 64 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -83690,25 +116002,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -83751,12 +116065,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83767,20 +116083,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 527 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -83788,10 +116104,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -83803,7 +116119,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83823,28 +116139,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83857,26 +116173,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -83919,12 +116237,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83935,8 +116255,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 528 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -83945,21 +116265,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -83987,7 +116307,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -83997,22 +116317,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 96 LSCB: 64 - LSPA: 8 + LSPA: 5 LSPB: 8 - LVCA: 32 + LVCA: 48 LVCB: 32 - LVPA: 4 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84026,22 +116346,24 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -84085,12 +116407,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84101,8 +116425,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 529 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84111,10 +116435,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -84126,7 +116450,7 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -84139,54 +116463,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -84194,24 +116518,24 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -84255,12 +116579,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84271,31 +116597,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 530 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -84315,40 +116643,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3328 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84363,23 +116691,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -84423,12 +116753,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84439,8 +116771,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 531 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84449,19 +116781,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -84475,7 +116807,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84483,40 +116815,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84529,24 +116861,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -84589,12 +116923,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84605,8 +116941,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 532 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84615,23 +116951,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -84643,48 +116979,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84697,26 +117033,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -84759,12 +117095,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84775,8 +117113,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 533 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84785,21 +117123,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -84811,7 +117151,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84819,72 +117159,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -84898,7 +117240,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -84927,12 +117269,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84943,31 +117287,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 534 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -84979,7 +117323,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84987,46 +117331,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -85035,24 +117379,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -85066,7 +117412,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -85095,12 +117441,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85111,31 +117459,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 535 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85147,7 +117495,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -85173,22 +117521,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 128 - LSPA: 4 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 - LVPA: 2 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85201,26 +117549,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -85234,7 +117584,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -85263,12 +117613,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85279,8 +117631,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 536 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85289,10 +117641,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -85301,9 +117653,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85323,72 +117675,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -85402,7 +117756,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -85431,12 +117785,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85447,28 +117803,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 537 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -85483,7 +117839,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -85491,46 +117847,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -85539,23 +117895,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -85599,12 +117957,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85615,31 +117975,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 538 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85667,7 +118027,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -85678,21 +118038,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 + LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85707,24 +118067,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -85767,12 +118129,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85783,8 +118147,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 539 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85793,11 +118157,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -85819,14 +118183,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -85845,28 +118209,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -85874,19 +118234,23 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -85903,8 +118267,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -85933,12 +118297,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85949,33 +118315,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 540 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85994,41 +118358,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -86043,23 +118407,23 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -86103,12 +118467,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86119,8 +118485,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 541 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86129,21 +118495,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86155,7 +118523,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86163,70 +118531,72 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -86269,12 +118639,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86285,31 +118657,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 542 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -86323,7 +118695,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86343,34 +118715,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -86378,25 +118750,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -86410,7 +118784,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -86439,12 +118813,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86455,35 +118831,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 543 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -86491,7 +118867,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86511,20 +118887,20 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -86538,7 +118914,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -86546,19 +118922,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -86578,7 +118956,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -86607,12 +118985,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86623,31 +119003,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 544 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86659,7 +119039,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86667,46 +119047,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -86714,24 +119094,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -86775,12 +119157,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86791,31 +119175,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 545 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86827,7 +119211,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86853,53 +119237,55 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -86914,7 +119300,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -86943,12 +119329,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86959,20 +119347,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 546 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -86980,10 +119368,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86995,48 +119383,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -87049,26 +119437,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87111,12 +119499,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87127,8 +119517,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 547 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87137,21 +119527,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87163,13 +119555,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -87183,28 +119575,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -87217,26 +119605,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87249,7 +119639,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -87279,12 +119669,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87295,8 +119687,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 548 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87305,21 +119697,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87331,78 +119723,78 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87415,7 +119807,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -87445,12 +119837,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87461,33 +119855,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 549 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87499,15 +119891,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -87515,38 +119907,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -87554,23 +119946,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87613,12 +120009,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87629,15 +120027,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 550 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -87650,12 +120048,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87687,7 +120083,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -87729,12 +120125,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -87783,12 +120181,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87799,8 +120199,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 551 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87819,9 +120219,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -87843,30 +120243,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 8 LVPB: 4 LdcEqualsLdd: false @@ -87897,18 +120297,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87951,12 +120353,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87967,8 +120371,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87983,11 +120387,11 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -88003,7 +120407,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88011,72 +120415,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88119,12 +120525,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88135,31 +120543,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88171,54 +120579,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88226,25 +120634,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88287,12 +120695,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88303,31 +120713,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88339,7 +120751,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88375,12 +120787,12 @@ LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88393,7 +120805,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -88401,18 +120813,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88455,12 +120869,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88471,8 +120887,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88495,7 +120911,7 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88527,28 +120943,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 8 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 8 - LVCA: 32 + LVCA: 8 LVCB: 32 - LVPA: 8 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88562,24 +120978,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -88623,12 +121041,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88639,29 +121059,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -88675,54 +121095,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88730,25 +121150,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88791,12 +121211,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88807,31 +121229,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88843,7 +121267,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88852,71 +121276,73 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88959,12 +121385,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88975,35 +121403,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89011,7 +121439,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89031,34 +121459,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -89066,25 +121494,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -89127,12 +121557,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89143,31 +121575,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 559 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89179,7 +121611,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89187,40 +121619,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89233,19 +121665,21 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -89295,12 +121729,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89311,35 +121747,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 560 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89347,7 +121783,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89355,7 +121791,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89363,32 +121799,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89401,26 +121837,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -89463,12 +121901,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89479,31 +121919,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 561 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89515,7 +121955,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89523,7 +121963,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89531,63 +121971,65 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -89631,12 +122073,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89647,31 +122091,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 562 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89690,31 +122134,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 4 LVPB: 16 LdcEqualsLdd: false @@ -89730,14 +122174,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -89745,18 +122189,18 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -89799,12 +122243,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89815,31 +122261,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 563 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89857,42 +122305,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89907,23 +122351,27 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -89935,7 +122383,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89965,12 +122413,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89981,15 +122431,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 564 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -89997,17 +122447,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90019,54 +122467,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -90081,17 +122525,21 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90103,7 +122551,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90133,12 +122581,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90149,8 +122599,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 565 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90165,21 +122615,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90187,81 +122635,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90273,7 +122719,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90303,12 +122749,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90319,31 +122767,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 566 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90381,22 +122829,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 64 + LSCB: 64 LSPA: 2 LSPB: 2 - LVCA: 128 - LVCB: 128 + LVCA: 64 + LVCB: 64 LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90410,26 +122858,28 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90442,7 +122892,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -90471,12 +122921,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90487,20 +122939,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 567 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -90508,14 +122960,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90523,7 +122975,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90543,34 +122995,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -90578,26 +123030,28 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90610,7 +123064,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -90639,12 +123093,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90655,31 +123111,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 568 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90699,7 +123155,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -90707,28 +123163,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -90747,25 +123203,27 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90778,7 +123236,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -90807,12 +123265,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90823,29 +123283,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 569 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -90867,32 +123327,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -90921,19 +123381,21 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90946,7 +123408,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -90975,12 +123437,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90991,28 +123455,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 570 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -91034,9 +123498,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91047,7 +123511,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -91057,10 +123521,10 @@ LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -91089,17 +123553,21 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91112,7 +123580,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -91141,12 +123609,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91157,33 +123627,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 571 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91202,9 +123670,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91215,7 +123683,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -91225,10 +123693,10 @@ LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -91257,19 +123725,19 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91282,7 +123750,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -91311,12 +123779,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91327,31 +123797,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 572 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91363,81 +123835,81 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 64 - LSPA: 32 + LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 16 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91479,12 +123951,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91495,31 +123969,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 573 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91531,81 +124007,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91617,7 +124091,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -91647,12 +124121,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91663,31 +124139,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 574 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91699,48 +124175,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 32 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 32 + LSPB: 4 + LVCA: 16 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91753,27 +124225,29 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91785,7 +124259,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -91815,12 +124289,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91831,31 +124307,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 575 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91867,14 +124343,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -91895,26 +124371,26 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 8 + LSPA: 8 + LSPB: 4 LVCA: 16 LVCB: 32 - LVPA: 8 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91929,19 +124405,19 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91983,12 +124459,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91999,8 +124477,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 576 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92020,10 +124498,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92041,75 +124521,71 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 16 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92121,7 +124597,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -92151,12 +124627,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92167,31 +124645,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 577 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92203,79 +124683,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false + DepthU: 8 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 8 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92287,7 +124767,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -92317,12 +124797,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92333,33 +124815,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 578 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92371,7 +124851,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -92379,73 +124859,75 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 16 - LVCA: 32 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92458,7 +124940,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -92487,12 +124969,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92503,31 +124987,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 579 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92539,14 +125023,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92565,28 +125049,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -92594,24 +125078,28 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92624,7 +125112,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -92653,12 +125141,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92669,20 +125159,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 580 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -92690,12 +125180,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92707,7 +125195,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -92715,36 +125203,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -92754,32 +125242,34 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92792,7 +125282,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -92821,12 +125311,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92837,31 +125329,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 581 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92875,81 +125367,81 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92991,12 +125483,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -93007,31 +125501,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 582 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93043,54 +125539,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 32 + LVPA: 2 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93098,26 +125594,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -93130,7 +125626,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93159,12 +125655,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -93175,31 +125673,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 583 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93211,7 +125711,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93237,28 +125737,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93266,25 +125766,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93298,7 +125800,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93327,31 +125829,35 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 584 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -93364,14 +125870,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -93399,7 +125905,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -93441,12 +125947,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -93466,7 +125974,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93495,24 +126003,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93531,9 +126043,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -93553,42 +126065,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -93609,18 +126117,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93633,7 +126143,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -93663,24 +126173,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93695,11 +126209,11 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -93723,7 +126237,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -93731,28 +126245,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -93762,33 +126276,35 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93831,44 +126347,48 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -93883,7 +126403,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93891,46 +126411,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93939,24 +126459,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93999,47 +126521,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94071,28 +126597,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 4 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94107,24 +126633,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94138,7 +126666,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94167,24 +126695,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94193,19 +126725,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -94219,7 +126751,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94227,46 +126759,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94274,25 +126806,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94306,7 +126840,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94335,47 +126869,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94387,7 +126925,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94414,27 +126952,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 4 LVCA: 32 - LVCB: 16 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94443,22 +126981,24 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94472,7 +127012,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94501,47 +127041,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94555,48 +127099,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94609,26 +127153,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94671,24 +127215,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94697,21 +127245,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94723,7 +127273,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94731,46 +127281,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94778,23 +127328,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94837,47 +127389,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94891,54 +127447,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94946,22 +127498,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -94975,7 +127531,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95005,49 +127561,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95059,7 +127617,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95085,55 +127643,57 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 8 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95146,7 +127706,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95175,47 +127735,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95227,7 +127791,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95235,46 +127799,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95282,26 +127846,28 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95314,7 +127880,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95343,30 +127909,34 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] @@ -95379,11 +127949,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95395,81 +127965,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95481,7 +128049,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95511,30 +128079,34 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -95543,15 +128115,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95563,7 +128135,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95571,19 +128143,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -95593,31 +128165,31 @@ LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -95625,19 +128197,21 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95650,7 +128224,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95679,24 +128253,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -95715,11 +128293,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95731,54 +128309,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 16 - LVCA: 32 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95787,25 +128365,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95818,7 +128396,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95847,31 +128425,35 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -95883,11 +128465,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [4, 1028.02] @@ -97961,8 +130545,6 @@ - [100, 9061.26] - - [49, 2048, 128, 512] - [98, 6963.26] - - - [784, 512, 64, 128] - - [100, 8822.52] - - [784, 128, 128, 512] - [107, 8983.53] - - [196, 256, 64, 1024] @@ -97973,36 +130555,22 @@ - [99, 8581.25] - - [49, 2048, 256, 512] - [98, 7049.54] - - - [196, 1024, 64, 256] - - [101, 7953.59] - - [784, 128, 256, 512] - [109, 9102.89] - - [196, 256, 128, 1024] - [101, 8085.79] - - - [3136, 64, 64, 256] - - [105, 9266.03] - - - [784, 128, 64, 512] - - [106, 8809.29] - - - [49, 2048, 64, 512] - - [98, 6843.85] - - [3136, 64, 128, 256] - [105, 9381.29] - - [3136, 256, 128, 64] - [103, 8982.54] - - [784, 512, 128, 128] - [100, 8965.89] - - - [3136, 256, 64, 64] - - [103, 8879.7] - - [3136, 64, 256, 256] - [105, 9566.33] - - - [3136, 64, 64, 64] - - [104, 8313.95] - - [3136, 64, 256, 64] - [99, 8743.7] - - [196, 1024, 128, 256] - [102, 8119.33] - - - [49, 512, 64, 2048] - - [110, 7055.31] - - [49, 512, 256, 2048] - [111, 7166.31] - - [196, 1024, 256, 256] @@ -100875,4990 +133443,5482 @@ - [280, 4689.35] - - [64, 64, 36, 1760] - [230, 5622.24] + - - [196, 528, 32, 32] + - [313, 4088.41] + - - [5329, 64, 32, 80] + - [306, 8331.14] + - - [64, 2880, 1, 320] + - [357, 4362.6] + - - [49, 832, 32, 256] + - [320, 5618.63] + - - [3136, 64, 64, 64] + - [306, 8457.65] + - - [196, 512, 32, 24] + - [307, 3621.73] + - - [289, 1120, 1, 160] + - [303, 3302.86] + - - [1225, 192, 32, 32] + - [311, 6194.57] + - - [64, 2048, 32, 384] + - [334, 9541.54] + - - [1001, 1536, 1, 32] + - [305, 3575.67] + - - [289, 1792, 1, 320] + - [328, 5140.33] + - - [3136, 256, 64, 64] + - [329, 9310.12] + - - [1001, 1024, 1, 32] + - [300, 2733.4] + - - [196, 480, 32, 64] + - [361, 5070.42] + - - [64, 1728, 1, 320] + - [358, 3205.57] + - - [49, 832, 32, 160] + - [362, 4988.82] + - - [49, 2048, 64, 512] + - [332, 7370.31] + - - [49, 832, 32, 384] + - [320, 5901.95] + - - [289, 896, 1, 192] + - [346, 3452.59] + - - [289, 1024, 32, 384] + - [365, 8902.42] + - - [784, 192, 32, 96] + - [376, 7853.63] + - - [50176, 256, 1, 128] + - [339, 9041.83] + - - [289, 1024, 32, 256] + - [374, 8660.72] + - - [289, 1024, 32, 192] + - [363, 8433.35] + - - [12544, 512, 1, 256] + - [323, 9187.34] + - - [1225, 1728, 1, 192] + - [327, 7720.85] + - - [196, 480, 32, 96] + - [372, 5662.5] + - - [196, 512, 32, 144] + - [366, 6531.38] + - - [784, 400, 1, 32] + - [301, 1280.0] + - - [289, 768, 32, 128] + - [367, 7913.61] + - - [5329, 576, 1, 96] + - [310, 7563.46] + - - [49, 1200, 1, 128] + - [354, 1011.61] + - - [64, 1536, 32, 256] + - [368, 9159.54] + - - [289, 2592, 1, 384] + - [336, 6002.71] + - - [196, 528, 32, 128] + - [371, 5987.1] + - - [64, 2048, 32, 448] + - [334, 9669.87] + - - [196, 1024, 64, 256] + - [373, 7818.94] + - - [5329, 448, 1, 64] + - [306, 6201.02] + - - [784, 256, 32, 64] + - [308, 7623.18] + - - [784, 192, 32, 32] + - [313, 5874.26] + - - [21609, 288, 1, 32] + - [326, 5296.5] + - - [784, 256, 32, 32] + - [304, 6235.46] + - - [5041, 720, 1, 192] + - [322, 8140.98] + - - [289, 2016, 1, 256] + - [319, 5404.05] + - - [196, 512, 32, 128] + - [364, 6366.82] + - - [289, 768, 32, 160] + - [366, 8253.88] + - - [64, 1536, 32, 384] + - [337, 9508.5] + - - [64, 1280, 32, 320] + - [337, 9070.73] + - - [289, 896, 1, 128] + - [347, 2917.68] + - - [289, 3456, 1, 384] + - [327, 7274.91] + - - [196, 800, 1, 64] + - [349, 1393.78] + - - [64, 1280, 32, 384] + - [333, 9225.01] + - - [64, 1344, 1, 512] + - [352, 3041.45] + - - [1001, 4096, 1, 512] + - [333, 9391.77] + - - [1225, 192, 32, 64] + - [306, 7729.29] + - - [64, 1152, 1, 384] + - [356, 2440.65] + - - [729, 1600, 1, 192] + - [318, 6827.71] + - - [289, 1344, 1, 192] + - [316, 4439.04] + - - [784, 192, 32, 16] + - [343, 3663.04] + - - [3136, 1024, 1, 2048] + - [325, 9071.77] + - - [64, 1152, 1, 448] + - [353, 2564.45] + - - [49, 832, 32, 128] + - [316, 4733.16] + - - [784, 256, 32, 128] + - [329, 8471.6] + - - [49, 800, 1, 128] + - [351, 633.535] + - - [196, 512, 32, 32] + - [313, 4354.26] + - - [1225, 384, 32, 96] + - [330, 8751.63] + - - [5041, 576, 1, 96] + - [312, 7067.63] + - - [49, 832, 32, 48] + - [345, 3316.72] + - - [3136, 64, 64, 256] + - [367, 9721.9] + - - [5329, 160, 32, 64] + - [369, 8159.84] + - - [1225, 288, 32, 48] + - [359, 6673.65] + - - [4096, 9216, 1, 512] + - [341, 10116.9] + - - [196, 480, 32, 192] + - [370, 6388.46] + - - [64, 1152, 1, 256] + - [357, 1982.6] + - - [3136, 1024, 1, 512] + - [325, 8745.57] + - - [49, 832, 32, 32] + - [344, 2717.87] + - - [784, 192, 32, 64] + - [308, 7216.32] + - - [289, 1024, 32, 128] + - [331, 7970.5] + - - [289, 768, 32, 192] + - [375, 8327.27] + - - [289, 1120, 1, 192] + - [315, 3716.9] + - - [196, 512, 32, 112] + - [321, 6252.81] + - - [1001, 2048, 1, 32] + - [309, 4000.09] + - - [1225, 288, 32, 64] + - [369, 7208.04] + - - [196, 600, 1, 64] + - [348, 1093.95] + - - [1225, 384, 32, 192] + - [330, 9332.66] + - - [50176, 256, 1, 512] + - [340, 9833.54] + - - [196, 512, 32, 160] + - [367, 6614.34] + - - [4096, 4096, 1, 512] + - [338, 10032.2] + - - [49, 832, 32, 192] + - [316, 5244.53] + - - [1225, 256, 32, 64] + - [306, 7972.35] + - - [64, 2048, 32, 320] + - [334, 9404.27] + - - [196, 480, 32, 16] + - [360, 2724.49] + - - [1225, 256, 32, 48] + - [308, 7100.38] + - - [64, 1280, 32, 448] + - [333, 9344.41] + - - [1225, 1200, 1, 64] + - [302, 5157.89] + - - [1225, 384, 32, 64] + - [306, 8219.96] + - - [12544, 512, 1, 1024] + - [325, 9672.72] + - - [64, 1280, 32, 192] + - [321, 8525.01] + - - [196, 512, 32, 64] + - [306, 5489.34] + - - [289, 1792, 1, 256] + - [324, 4831.61] + - - [196, 528, 32, 256] + - [342, 6453.82] + - - [49, 512, 64, 2048] + - [377, 7548.98] + - - [64, 2048, 32, 192] + - [329, 8955.81] + - - [784, 512, 64, 128] + - [329, 9160.73] + - - [784, 128, 64, 512] + - [336, 9280.69] + - - [196, 528, 32, 160] + - [370, 6161.15] + - - [1225, 192, 32, 48] + - [306, 7236.92] + - - [64, 1728, 1, 192] + - [356, 2480.57] + - - [1001, 2048, 1, 64] + - [382, 5714.42] + - - [5329, 64, 128, 80] + - [389, 8835.29] + - - [64, 1280, 128, 448] + - [387, 10020.5] + - - [289, 768, 128, 128] + - [390, 8542.71] + - - [1225, 192, 128, 64] + - [379, 8444.77] + - - [1225, 288, 128, 48] + - [392, 7244.66] + - - [289, 768, 128, 192] + - [394, 8794.49] + - - [289, 768, 128, 160] + - [391, 8705.33] + - - [64, 2048, 128, 192] + - [385, 9780.26] + - - [64, 1280, 128, 384] + - [388, 9950.9] + - - [1225, 256, 128, 48] + - [380, 8273.61] + - - [1225, 192, 128, 48] + - [380, 8140.32] + - - [1225, 288, 128, 64] + - [392, 7886.21] + - - [64, 1280, 128, 320] + - [384, 9894.56] + - - [1225, 256, 128, 64] + - [385, 8572.51] + - - [1001, 2048, 1, 128] + - [386, 7289.06] + - - [1225, 192, 128, 32] + - [381, 7104.57] + - - [64, 1280, 128, 192] + - [393, 9642.08] + - - [1001, 1536, 1, 64] + - [383, 5146.56] - - [1024, 128, 1, 128] - - [304, 1028.12] + - [399, 1028.12] - - [4, 704, 1, 1280] - - [343, 363.455] + - [438, 363.455] - - [4, 1856, 1, 3328] - - [343, 579.534] + - [438, 579.534] - - [1856, 448, 1, 3328] - - [380, 6966.83] + - [475, 6966.83] - - [2944, 4288, 1, 1280] - - [375, 9057.98] + - [470, 9057.98] - - [2368, 64, 1, 3328] - - [336, 5837.66] + - [431, 5837.66] - - [2368, 5888, 1, 256] - - [380, 9111.16] + - [475, 9111.16] - - [128, 64, 1, 256] - - [342, 374.591] + - [437, 374.591] - - [5888, 1024, 1, 1280] - - [385, 8570.54] + - [480, 8570.54] - - [128, 6784, 1, 3328] - - [348, 7703.96] + - [443, 7703.96] - - [64, 4, 1, 256] - - [394, 11.3219] + - [489, 11.3219] - - [5888, 1856, 1, 3328] - - [380, 9394.4] + - [475, 9394.4] - - [5056, 704, 1, 256] - - [383, 8026.99] + - [478, 8026.99] - - [5888, 2944, 1, 3328] - - [373, 7608.21] + - [468, 7608.21] - - [1856, 4288, 1, 256] - - [374, 8986.42] + - [469, 8986.42] - - [1024, 5056, 1, 128] - - [366, 3898.34] + - [461, 3898.34] - - [5056, 5056, 1, 3328] - - [374, 9536.85] + - [469, 9536.85] - - [1408, 5888, 1, 1280] - - [375, 9279.19] + - [470, 9279.19] - - [2368, 448, 1, 128] - - [367, 2474.42] + - [462, 2474.42] - - [1024, 3584, 1, 3328] - - [377, 9258.58] + - [472, 9258.58] - - [4, 2944, 1, 1280] - - [329, 611.84] + - [424, 611.84] - - [1408, 64, 1, 128] - - [300, 858.31] + - [395, 858.31] - - [256, 4288, 1, 3328] - - [380, 7616.08] + - [475, 7616.08] - - [5888, 1408, 1, 1280] - - [373, 9620.39] + - [468, 9620.39] - - [704, 1856, 1, 3328] - - [374, 9033.75] + - [469, 9033.75] - - [4, 1408, 1, 128] - - [387, 24.455] + - [482, 24.455] - - [1024, 2368, 1, 256] - - [374, 7526.25] + - [469, 7526.25] - - [1408, 1856, 1, 1280] - - [377, 8324.19] + - [472, 8324.19] - - [1408, 64, 1, 1280] - - [348, 4681.24] + - [443, 4681.24] - - [448, 1024, 1, 1280] - - [374, 7112.53] + - [469, 7112.53] - - [256, 1408, 1, 3328] - - [380, 5825.51] + - [475, 5825.51] - - [5056, 5056, 1, 1280] - - [383, 9233.65] + - [478, 9233.65] - - [448, 5056, 1, 256] - - [375, 7003.27] + - [470, 7003.27] - - [704, 1856, 1, 1280] - - [374, 8877.38] + - [469, 8877.38] - - [128, 5056, 1, 128] - - [366, 2301.14] + - [461, 2301.14] - - [2368, 128, 1, 256] - - [374, 3849.04] + - [469, 3849.04] - - [1856, 1408, 1, 128] - - [369, 4202.31] + - [464, 4202.31] - - [64, 5056, 1, 256] - - [375, 3109.62] + - [470, 3109.62] - - [6784, 256, 1, 3328] - - [374, 6388.53] + - [469, 6388.53] - - [6784, 4288, 1, 3328] - - [385, 9114.67] + - [480, 9114.67] - - [4288, 448, 1, 256] - - [378, 5783.05] + - [473, 5783.05] - - [64, 704, 1, 128] - - [311, 379.519] + - [406, 379.519] - - [1856, 2368, 1, 3328] - - [374, 9128.46] + - [469, 9128.46] - - [4288, 2944, 1, 1280] - - [380, 9182.33] + - [475, 9182.33] - - [704, 5056, 1, 1280] - - [374, 9071.57] + - [469, 9071.57] - - [2368, 704, 1, 3328] - - [380, 7731.43] + - [475, 7731.43] - - [256, 5888, 1, 256] - - [374, 7920.38] + - [469, 7920.38] - - [1856, 4288, 1, 3328] - - [380, 9330.07] + - [475, 9330.07] - - [256, 2944, 1, 256] - - [381, 5312.27] + - [476, 5312.27] - - [5888, 1024, 1, 256] - - [372, 6710.97] + - [467, 6710.97] - - [448, 64, 1, 1280] - - [347, 2814.53] + - [442, 2814.53] - - [448, 5056, 1, 3328] - - [374, 8255.53] + - [469, 8255.53] - - [3584, 4, 1, 1280] - - [323, 640.815] + - [418, 640.815] - - [2944, 64, 1, 256] - - [322, 2621.54] + - [417, 2621.54] - - [128, 4, 1, 1280] - - [394, 86.3316] + - [489, 86.3316] - - [1408, 2944, 1, 256] - - [374, 8848.99] + - [469, 8848.99] - - [256, 1856, 1, 1280] - - [374, 7366.55] + - [469, 7366.55] - - [6784, 5056, 1, 3328] - - [385, 8332.16] + - [480, 8332.16] - - [5056, 5056, 1, 256] - - [380, 9171.74] + - [475, 9171.74] - - [1408, 6784, 1, 128] - - [366, 5079.19] + - [461, 5079.19] - - [64, 1024, 1, 1280] - - [338, 3679.31] + - [433, 3679.31] - - [2944, 4, 1, 256] - - [329, 369.543] + - [424, 369.543] - - [704, 5056, 1, 128] - - [366, 4509.27] + - [461, 4509.27] - - [4, 2368, 1, 1280] - - [323, 569.844] + - [418, 569.844] - - [2368, 2944, 1, 1280] - - [385, 7451.14] + - [480, 7451.14] - - [128, 3584, 1, 1280] - - [383, 6071.26] + - [478, 6071.26] - - [6784, 6784, 1, 1280] - - [380, 9535.74] + - [475, 9535.74] - - [1408, 4288, 1, 1280] - - [383, 8255.09] + - [478, 8255.09] - - [3584, 4288, 1, 1280] - - [385, 9651.19] + - [480, 9651.19] - - [2368, 704, 1, 1280] - - [380, 8291.4] + - [475, 8291.4] - - [5056, 4288, 1, 3328] - - [372, 9406.36] + - [467, 9406.36] - - [3584, 2368, 1, 3328] - - [380, 9350.32] + - [475, 9350.32] - - [64, 704, 1, 1280] - - [347, 3384.59] + - [442, 3384.59] - - [4288, 256, 1, 256] - - [380, 5593.62] + - [475, 5593.62] - - [2944, 128, 1, 128] - - [302, 2130.6] + - [397, 2130.6] - - [6784, 448, 1, 1280] - - [383, 8815.85] + - [478, 8815.85] - - [1408, 2944, 1, 128] - - [366, 4558.34] + - [461, 4558.34] - - [4288, 2944, 1, 256] - - [385, 7865.43] + - [480, 7865.43] - - [5888, 704, 1, 1280] - - [374, 9262.99] + - [469, 9262.99] - - [1856, 64, 1, 1280] - - [348, 4359.15] + - [443, 4359.15] - - [448, 5888, 1, 128] - - [369, 4000.59] + - [464, 4000.59] - - [5888, 64, 1, 3328] - - [349, 6603.39] + - [444, 6603.39] - - [2944, 256, 1, 3328] - - [374, 8423.63] + - [469, 8423.63] - - [1024, 64, 1, 128] - - [319, 582.642] + - [414, 582.642] - - [5056, 2368, 1, 1280] - - [374, 9419.91] + - [469, 9419.91] - - [448, 3584, 1, 1280] - - [374, 7985.82] + - [469, 7985.82] - - [6784, 5888, 1, 256] - - [372, 9494.36] + - [467, 9494.36] - - [704, 1024, 1, 128] - - [366, 2813.35] + - [461, 2813.35] - - [704, 128, 1, 1280] - - [348, 4477.71] + - [443, 4477.71] - - [5888, 2944, 1, 128] - - [369, 4745.96] + - [464, 4745.96] - - [4, 3584, 1, 128] - - [386, 96.479] + - [481, 96.479] - - [1408, 448, 1, 1280] - - [374, 6912.8] + - [469, 6912.8] - - [1024, 1408, 1, 256] - - [382, 5810.85] + - [477, 5810.85] - - [2368, 2368, 1, 3328] - - [383, 9088.71] + - [478, 9088.71] - - [1856, 6784, 1, 128] - - [369, 5168.32] + - [464, 5168.32] - - [5056, 704, 1, 3328] - - [375, 7464.9] + - [470, 7464.9] - - [1408, 1856, 1, 256] - - [380, 6727.69] + - [475, 6727.69] - - [1408, 704, 1, 3328] - - [380, 8379.53] + - [475, 8379.53] - - [2368, 5056, 1, 256] - - [380, 8664.11] + - [475, 8664.11] - - [5888, 1856, 1, 256] - - [385, 5810.02] + - [480, 5810.02] - - [4288, 64, 1, 3328] - - [362, 6583.94] + - [457, 6583.94] - - [2368, 4, 1, 1280] - - [395, 545.251] + - [490, 545.251] - - [704, 5888, 1, 256] - - [380, 8813.71] + - [475, 8813.71] - - [4288, 64, 1, 256] - - [338, 3059.97] + - [433, 3059.97] - - [6784, 64, 1, 256] - - [380, 3490.96] + - [475, 3490.96] - - [2944, 256, 1, 256] - - [374, 6970.4] + - [469, 6970.4] - - [2944, 6784, 1, 3328] - - [374, 9475.79] + - [469, 9475.79] - - [704, 1408, 1, 3328] - - [374, 8154.18] + - [469, 8154.18] - - [3584, 704, 1, 3328] - - [374, 8995.07] + - [469, 8995.07] - - [2944, 256, 1, 128] - - [366, 2824.13] + - [461, 2824.13] - - [6784, 4, 1, 1280] - - [323, 625.714] + - [418, 625.714] - - [1024, 64, 1, 1280] - - [335, 3307.91] + - [430, 3307.91] - - [448, 4288, 1, 256] - - [380, 6074.48] + - [475, 6074.48] - - [64, 3584, 1, 3328] - - [328, 6200.26] + - [423, 6200.26] - - [704, 2368, 1, 1280] - - [374, 8291.4] + - [469, 8291.4] - - [448, 2944, 1, 128] - - [366, 3221.87] + - [461, 3221.87] - - [1856, 2368, 1, 1280] - - [385, 6855.24] + - [480, 6855.24] - - [2368, 128, 1, 3328] - - [336, 6479.61] + - [431, 6479.61] - - [2944, 128, 1, 256] - - [374, 3828.23] + - [469, 3828.23] - - [448, 1408, 1, 256] - - [375, 4525.9] + - [470, 4525.9] - - [1856, 4288, 1, 1280] - - [373, 9160.32] + - [468, 9160.32] - - [64, 5056, 1, 3328] - - [356, 6819.3] + - [451, 6819.3] - - [4, 704, 1, 256] - - [340, 123.541] + - [435, 123.541] - - [1024, 448, 1, 128] - - [369, 1989.27] + - [464, 1989.27] - - [704, 4, 1, 1280] - - [343, 381.931] + - [438, 381.931] - - [704, 256, 1, 128] - - [366, 1109.17] + - [461, 1109.17] - - [704, 2944, 1, 128] - - [366, 4089.03] + - [461, 4089.03] - - [1408, 1024, 1, 1280] - - [380, 8192.08] + - [475, 8192.08] - - [704, 6784, 1, 256] - - [374, 6717.9] + - [469, 6717.9] - - [6784, 704, 1, 256] - - [380, 5429.22] + - [475, 5429.22] - - [5056, 1408, 1, 128] - - [366, 4954.5] + - [461, 4954.5] - - [256, 3584, 1, 3328] - - [374, 7890.96] + - [469, 7890.96] - - [4, 5888, 1, 3328] - - [391, 691.047] + - [486, 691.047] - - [128, 1408, 1, 128] - - [313, 1393.14] + - [408, 1393.14] - - [3584, 4288, 1, 3328] - - [376, 8900.87] + - [471, 8900.87] - - [5888, 1856, 1, 1280] - - [377, 9345.85] + - [472, 9345.85] - - [5056, 1024, 1, 3328] - - [378, 7834.84] + - [473, 7834.84] - - [5056, 64, 1, 1280] - - [356, 5890.14] + - [451, 5890.14] - - [1024, 704, 1, 256] - - [374, 6007.57] + - [469, 6007.57] - - [1024, 4288, 1, 128] - - [368, 3497.09] + - [463, 3497.09] - - [4288, 64, 1, 1280] - - [353, 4726.59] + - [448, 4726.59] - - [2368, 3584, 1, 1280] - - [372, 8128.82] + - [467, 8128.82] - - [2368, 6784, 1, 1280] - - [372, 9478.72] + - [467, 9478.72] - - [1024, 256, 1, 256] - - [380, 4092.1] + - [475, 4092.1] - - [1856, 4, 1, 1280] - - [395, 509.903] + - [490, 509.903] - - [448, 448, 1, 256] - - [380, 3001.28] + - [475, 3001.28] - - [2944, 3584, 1, 3328] - - [381, 9081.91] + - [476, 9081.91] - - [128, 4288, 1, 128] - - [301, 2323.33] + - [396, 2323.33] - - [64, 448, 1, 256] - - [344, 1066.97] + - [439, 1066.97] - - [128, 1024, 1, 3328] - - [357, 6392.36] + - [452, 6392.36] - - [4, 1408, 1, 3328] - - [340, 616.656] + - [435, 616.656] - - [6784, 2944, 1, 256] - - [383, 8547.73] + - [478, 8547.73] - - [64, 1856, 1, 1280] - - [356, 4409.71] + - [451, 4409.71] - - [64, 1024, 1, 128] - - [300, 554.902] + - [395, 554.902] - - [4288, 2368, 1, 3328] - - [376, 8780.08] + - [471, 8780.08] - - [1856, 2368, 1, 256] - - [383, 4976.74] + - [478, 4976.74] - - [3584, 256, 1, 128] - - [368, 2812.37] + - [463, 2812.37] - - [3584, 6784, 1, 3328] - - [378, 9278.22] + - [473, 9278.22] - - [256, 1024, 1, 256] - - [374, 4346.53] + - [469, 4346.53] - - [4, 6784, 1, 3328] - - [393, 681.366] + - [488, 681.366] - - [1024, 5888, 1, 3328] - - [374, 9187.61] + - [469, 9187.61] - - [1024, 128, 1, 1280] - - [326, 3660.05] + - [421, 3660.05] - - [4288, 128, 1, 1280] - - [380, 6019.17] + - [475, 6019.17] - - [5056, 4288, 1, 1280] - - [372, 9343.96] + - [467, 9343.96] - - [5888, 64, 1, 256] - - [374, 4692.17] + - [469, 4692.17] - - [1856, 256, 1, 1280] - - [380, 4790.38] + - [475, 4790.38] - - [64, 5888, 1, 3328] - - [348, 6702.2] + - [443, 6702.2] - - [2944, 5888, 1, 128] - - [369, 5202.65] + - [464, 5202.65] - - [704, 5888, 1, 1280] - - [374, 9264.29] + - [469, 9264.29] - - [2368, 3584, 1, 128] - - [366, 5053.71] + - [461, 5053.71] - - [6784, 5888, 1, 3328] - - [372, 7926.8] + - [467, 7926.8] - - [704, 1024, 1, 1280] - - [373, 5402.6] + - [468, 5402.6] - - [448, 256, 1, 3328] - - [356, 6124.65] + - [451, 6124.65] - - [448, 1856, 1, 128] - - [367, 2885.96] + - [462, 2885.96] - - [128, 1024, 1, 128] - - [301, 1013.22] + - [396, 1013.22] - - [2944, 4, 1, 128] - - [386, 77.6374] + - [481, 77.6374] - - [1024, 704, 1, 1280] - - [374, 7365.58] + - [469, 7365.58] - - [128, 5888, 1, 256] - - [374, 6990.61] + - [469, 6990.61] - - [1024, 5056, 1, 1280] - - [379, 9422.0] + - [474, 9422.0] - - [4288, 1024, 1, 256] - - [381, 6270.03] + - [476, 6270.03] - - [2944, 2368, 1, 128] - - [366, 4918.18] + - [461, 4918.18] - - [704, 704, 1, 3328] - - [374, 7963.65] + - [469, 7963.65] - - [704, 1408, 1, 1280] - - [374, 8347.32] + - [469, 8347.32] - - [5888, 448, 1, 1280] - - [380, 5217.05] + - [475, 5217.05] - - [3584, 256, 1, 3328] - - [374, 7802.25] + - [469, 7802.25] - - [704, 5888, 1, 3328] - - [380, 8381.46] + - [475, 8381.46] - - [704, 1856, 1, 128] - - [366, 3598.38] + - [461, 3598.38] - - [128, 3584, 1, 3328] - - [336, 7161.11] + - [431, 7161.11] - - [6784, 2368, 1, 1280] - - [385, 9464.41] + - [480, 9464.41] - - [4, 4288, 1, 128] - - [386, 132.68] + - [481, 132.68] - - [128, 704, 1, 1280] - - [348, 4463.85] + - [443, 4463.85] - - [3584, 2944, 1, 256] - - [385, 8201.24] + - [480, 8201.24] - - [1856, 128, 1, 3328] - - [327, 6575.5] + - [422, 6575.5] - - [4, 64, 1, 1280] - - [343, 43.6745] + - [438, 43.6745] - - [4, 5056, 1, 3328] - - [323, 675.315] + - [418, 675.315] - - [128, 2944, 1, 1280] - - [327, 5916.99] + - [422, 5916.99] - - [2368, 1024, 1, 3328] - - [380, 8646.84] + - [475, 8646.84] - - [128, 256, 1, 3328] - - [361, 4130.85] + - [456, 4130.85] - - [1408, 5056, 1, 3328] - - [379, 9529.75] + - [474, 9529.75] - - [1856, 1856, 1, 3328] - - [378, 8114.99] + - [473, 8114.99] - - [3584, 128, 1, 256] - - [374, 5603.18] + - [469, 5603.18] - - [448, 1408, 1, 3328] - - [374, 7073.03] + - [469, 7073.03] - - [2368, 2368, 1, 256] - - [381, 7648.76] + - [476, 7648.76] - - [4288, 4288, 1, 1280] - - [376, 9244.11] + - [471, 9244.11] - - [64, 448, 1, 1280] - - [347, 2885.33] + - [442, 2885.33] - - [1408, 4288, 1, 256] - - [374, 8080.41] + - [469, 8080.41] - - [448, 4, 1, 256] - - [392, 84.4294] + - [487, 84.4294] - - [5888, 448, 1, 128] - - [369, 3540.8] + - [464, 3540.8] - - [448, 4, 1, 1280] - - [343, 322.257] + - [438, 322.257] - - [704, 6784, 1, 3328] - - [373, 8613.58] + - [468, 8613.58] - - [5888, 5888, 1, 1280] - - [380, 9502.05] + - [475, 9502.05] - - [5056, 1024, 1, 1280] - - [383, 9110.11] + - [478, 9110.11] - - [448, 5888, 1, 3328] - - [374, 8586.43] + - [469, 8586.43] - - [128, 4, 1, 128] - - [386, 4.27959] + - [481, 4.27959] - - [1024, 2944, 1, 1280] - - [382, 7096.53] + - [477, 7096.53] - - [5056, 5888, 1, 1280] - - [373, 9693.51] + - [468, 9693.51] - - [4288, 5888, 1, 128] - - [366, 5406.46] + - [461, 5406.46] - - [256, 3584, 1, 256] - - [374, 6908.37] + - [469, 6908.37] - - [1408, 3584, 1, 128] - - [366, 4645.69] + - [461, 4645.69] - - [256, 2944, 1, 3328] - - [377, 6284.4] + - [472, 6284.4] - - [448, 3584, 1, 128] - - [369, 3675.37] + - [464, 3675.37] - - [5888, 2944, 1, 1280] - - [379, 9628.9] + - [474, 9628.9] - - [4, 6784, 1, 1280] - - [323, 688.176] + - [418, 688.176] - - [2368, 5888, 1, 128] - - [366, 5273.96] + - [461, 5273.96] - - [64, 2944, 1, 128] - - [310, 1316.54] + - [405, 1316.54] - - [3584, 5888, 1, 256] - - [380, 9239.14] + - [475, 9239.14] - - [2368, 704, 1, 128] - - [369, 3537.65] + - [464, 3537.65] - - [3584, 2944, 1, 1280] - - [374, 9324.62] + - [469, 9324.62] - - [3584, 2368, 1, 128] - - [366, 4766.34] + - [461, 4766.34] - - [5056, 704, 1, 128] - - [366, 4487.95] + - [461, 4487.95] - - [448, 2368, 1, 128] - - [369, 2877.02] + - [464, 2877.02] - - [5056, 1408, 1, 3328] - - [385, 9515.97] + - [480, 9515.97] - - [1408, 704, 1, 256] - - [377, 6836.18] + - [472, 6836.18] - - [6784, 1024, 1, 3328] - - [372, 9309.65] + - [467, 9309.65] - - [6784, 2944, 1, 3328] - - [373, 9536.58] + - [468, 9536.58] - - [2944, 5056, 1, 3328] - - [374, 9526.25] + - [469, 9526.25] - - [1856, 1856, 1, 256] - - [374, 5239.24] + - [469, 5239.24] - - [1024, 5888, 1, 128] - - [366, 4006.28] + - [461, 4006.28] - - [2048, 7133, 1, 2048] - - [372, 9828.07] + - [467, 9828.07] - - [256, 4, 1, 128] - - [387, 4.38908] + - [482, 4.38908] - - [4288, 5888, 1, 1280] - - [382, 9202.83] + - [477, 9202.83] - - [4288, 4288, 1, 256] - - [377, 5521.18] + - [472, 5521.18] - - [448, 2944, 1, 3328] - - [380, 7724.53] + - [475, 7724.53] - - [4288, 1856, 1, 1280] - - [380, 8826.34] + - [475, 8826.34] - - [1856, 2944, 1, 3328] - - [374, 9194.9] + - [469, 9194.9] - - [256, 6784, 1, 3328] - - [374, 8740.33] + - [469, 8740.33] - - [64, 5888, 1, 256] - - [374, 4766.35] + - [469, 4766.35] - - [256, 5056, 1, 128] - - [366, 2937.6] + - [461, 2937.6] - - [5056, 1024, 1, 256] - - [385, 5467.91] + - [480, 5467.91] - - [704, 64, 1, 3328] - - [362, 4818.43] + - [457, 4818.43] - - [5056, 1856, 1, 3328] - - [379, 8861.69] + - [474, 8861.69] - - [4, 2944, 1, 3328] - - [329, 662.102] + - [424, 662.102] - - [4, 5056, 1, 256] - - [389, 494.121] + - [484, 494.121] - - [1856, 1408, 1, 256] - - [374, 8674.78] + - [469, 8674.78] - - [3584, 4, 1, 128] - - [386, 108.296] + - [481, 108.296] - - [448, 448, 1, 3328] - - [348, 6457.4] + - [443, 6457.4] - - [6784, 128, 1, 3328] - - [341, 7256.71] + - [436, 7256.71] - - [4288, 1408, 1, 128] - - [369, 4791.76] + - [464, 4791.76] - - [4288, 5056, 1, 256] - - [374, 8560.84] + - [469, 8560.84] - - [1408, 128, 1, 1280] - - [356, 5085.79] + - [451, 5085.79] - - [5056, 256, 1, 3328] - - [377, 7284.23] + - [472, 7284.23] - - [704, 704, 1, 256] - - [374, 6171.19] + - [469, 6171.19] - - [1024, 5888, 1, 1280] - - [379, 8852.89] + - [474, 8852.89] - - [6784, 2368, 1, 128] - - [367, 4729.3] + - [462, 4729.3] - - [4, 5056, 1, 1280] - - [340, 670.046] + - [435, 670.046] - - [64, 128, 1, 256] - - [342, 369.317] + - [437, 369.317] - - [128, 1856, 1, 1280] - - [336, 5549.13] + - [431, 5549.13] - - [5056, 3584, 1, 256] - - [380, 7115.84] + - [475, 7115.84] - - [1856, 1024, 1, 1280] - - [372, 8196.5] + - [467, 8196.5] - - [6784, 4288, 1, 1280] - - [373, 9509.66] + - [468, 9509.66] - - [1856, 1856, 1, 1280] - - [375, 5791.99] + - [470, 5791.99] - - [6784, 2944, 1, 128] - - [366, 5317.12] + - [461, 5317.12] - - [1408, 5056, 1, 1280] - - [375, 8980.73] + - [470, 8980.73] - - [4, 2368, 1, 3328] - - [340, 592.634] + - [435, 592.634] - - [5888, 1856, 1, 128] - - [365, 4600.2] + - [460, 4600.2] - - [448, 704, 1, 1280] - - [374, 2286.58] + - [469, 2286.58] - - [2368, 1024, 1, 128] - - [369, 3911.12] + - [464, 3911.12] - - [1024, 448, 1, 3328] - - [374, 7295.24] + - [469, 7295.24] - - [1856, 704, 1, 1280] - - [374, 8881.12] + - [469, 8881.12] - - [5056, 3584, 1, 128] - - [366, 4911.68] + - [461, 4911.68] - - [5888, 5888, 1, 3328] - - [382, 9243.9] + - [477, 9243.9] - - [6784, 1024, 1, 256] - - [385, 5475.41] + - [480, 5475.41] - - [2944, 2368, 1, 256] - - [380, 5670.77] + - [475, 5670.77] - - [256, 448, 1, 256] - - [331, 2293.86] + - [426, 2293.86] - - [5056, 5888, 1, 3328] - - [375, 7848.07] + - [470, 7848.07] - - [1856, 1024, 1, 256] - - [380, 7517.7] + - [475, 7517.7] - - [448, 1408, 1, 1280] - - [374, 6917.54] + - [469, 6917.54] - - [3584, 448, 1, 1280] - - [380, 7980.86] + - [475, 7980.86] - - [1024, 1024, 1, 1280] - - [377, 8384.52] + - [472, 8384.52] - - [448, 5888, 1, 256] - - [374, 7365.75] + - [469, 7365.75] - - [704, 64, 1, 128] - - [319, 358.755] + - [414, 358.755] - - [1408, 6784, 1, 3328] - - [380, 9094.19] + - [475, 9094.19] - - [448, 1024, 1, 128] - - [369, 1773.05] + - [464, 1773.05] - - [4288, 704, 1, 128] - - [366, 4355.38] + - [461, 4355.38] - - [128, 1856, 1, 128] - - [305, 1610.73] + - [400, 1610.73] - - [448, 2368, 1, 3328] - - [380, 7366.47] + - [475, 7366.47] - - [5056, 64, 1, 128] - - [305, 2157.33] + - [400, 2157.33] - - [5056, 2944, 1, 256] - - [374, 9123.16] + - [469, 9123.16] - - [6784, 5888, 1, 128] - - [365, 5285.9] + - [460, 5285.9] - - [704, 1024, 1, 256] - - [380, 6667.35] + - [475, 6667.35] - - [1024, 4, 1, 256] - - [329, 187.346] + - [424, 187.346] - - [2368, 1856, 1, 256] - - [380, 6777.94] + - [475, 6777.94] - - [128, 6784, 1, 1280] - - [377, 7052.71] + - [472, 7052.71] - - [1408, 3584, 1, 3328] - - [381, 9038.05] + - [476, 9038.05] - - [2368, 6784, 1, 256] - - [374, 9181.45] + - [469, 9181.45] - - [5056, 1408, 1, 1280] - - [379, 9422.0] + - [474, 9422.0] - - [256, 256, 1, 128] - - [311, 543.404] + - [406, 543.404] - - [5056, 4288, 1, 128] - - [369, 5340.02] + - [464, 5340.02] - - [1408, 1856, 1, 128] - - [366, 4270.99] + - [461, 4270.99] - - [1408, 5888, 1, 3328] - - [378, 9034.89] + - [473, 9034.89] - - [1856, 256, 1, 256] - - [380, 5847.93] + - [475, 5847.93] - - [6784, 6784, 1, 256] - - [373, 9624.48] + - [468, 9624.48] - - [64, 256, 1, 128] - - [312, 146.549] + - [407, 146.549] - - [4288, 2368, 1, 128] - - [365, 3897.04] + - [460, 3897.04] - - [1856, 4288, 1, 128] - - [366, 4337.17] + - [461, 4337.17] - - [256, 4288, 1, 1280] - - [374, 7499.52] + - [469, 7499.52] - - [2368, 2944, 1, 256] - - [379, 7703.28] + - [474, 7703.28] - - [4, 1856, 1, 256] - - [392, 264.064] + - [487, 264.064] - - [3584, 1856, 1, 1280] - - [374, 9224.43] + - [469, 9224.43] - - [6784, 6784, 1, 128] - - [366, 5476.13] + - [461, 5476.13] - - [256, 1856, 1, 128] - - [369, 1858.82] + - [464, 1858.82] - - [704, 64, 1, 1280] - - [347, 3368.77] + - [442, 3368.77] - - [5888, 5056, 1, 256] - - [380, 5859.91] + - [475, 5859.91] - - [3584, 448, 1, 256] - - [380, 7298.43] + - [475, 7298.43] - - [448, 4288, 1, 128] - - [366, 3813.55] + - [461, 3813.55] - - [2944, 4288, 1, 3328] - - [375, 9149.73] + - [470, 9149.73] - - [256, 6784, 1, 256] - - [374, 7984.95] + - [469, 7984.95] - - [1408, 4288, 1, 128] - - [369, 4728.44] + - [464, 4728.44] - - [2944, 704, 1, 3328] - - [380, 7149.86] + - [475, 7149.86] - - [128, 448, 1, 256] - - [346, 1699.18] + - [441, 1699.18] - - [512, 32, 1, 512] - - [346, 1127.6] + - [441, 1127.6] - - [3584, 3584, 1, 256] - - [375, 8558.11] + - [470, 8558.11] - - [448, 1408, 1, 128] - - [366, 2504.45] + - [461, 2504.45] - - [128, 256, 1, 1280] - - [347, 3216.59] + - [442, 3216.59] - - [3584, 5056, 1, 256] - - [372, 5674.45] + - [467, 5674.45] - - [6784, 128, 1, 256] - - [374, 6216.49] + - [469, 6216.49] - - [4288, 4, 1, 256] - - [390, 435.706] + - [485, 435.706] - - [64, 1408, 1, 3328] - - [348, 6186.01] + - [443, 6186.01] - - [704, 448, 1, 256] - - [380, 4005.08] + - [475, 4005.08] - - [2944, 2368, 1, 1280] - - [381, 8542.8] + - [476, 8542.8] - - [448, 64, 1, 3328] - - [361, 3835.33] + - [456, 3835.33] - - [1408, 3584, 1, 256] - - [374, 8714.63] + - [469, 8714.63] - - [3584, 4, 1, 3328] - - [329, 689.554] + - [424, 689.554] - - [6784, 3584, 1, 256] - - [379, 9271.34] + - [474, 9271.34] - - [256, 128, 1, 128] - - [312, 283.499] + - [407, 283.499] - - [704, 1408, 1, 128] - - [366, 3210.57] + - [461, 3210.57] - - [4, 2368, 1, 256] - - [392, 360.938] + - [487, 360.938] - - [2944, 448, 1, 128] - - [366, 3344.41] + - [461, 3344.41] - - [128, 1408, 1, 256] - - [374, 3186.38] + - [469, 3186.38] - - [4, 2944, 1, 256] - - [390, 384.622] + - [485, 384.622] - - [64, 128, 1, 3328] - - [343, 2103.72] + - [438, 2103.72] - - [5056, 2368, 1, 128] - - [366, 5219.76] + - [461, 5219.76] - - [2944, 2944, 1, 3328] - - [383, 9174.69] + - [478, 9174.69] - - [5056, 6784, 1, 256] - - [385, 8992.36] + - [480, 8992.36] - - [1856, 3584, 1, 128] - - [366, 4957.27] + - [461, 4957.27] - - [128, 2944, 1, 128] - - [304, 2241.48] + - [399, 2241.48] - - [1024, 704, 1, 3328] - - [384, 6545.11] + - [479, 6545.11] - - [6784, 448, 1, 256] - - [380, 5379.25] + - [475, 5379.25] - - [3584, 6784, 1, 128] - - [366, 5102.01] + - [461, 5102.01] - - [128, 4288, 1, 256] - - [374, 5211.86] + - [469, 5211.86] - - [704, 448, 1, 3328] - - [375, 4504.15] + - [470, 4504.15] - - [1024, 1024, 1, 3328] - - [377, 8009.77] + - [472, 8009.77] - - [128, 128, 1, 3328] - - [360, 3185.03] + - [455, 3185.03] - - [5056, 1856, 1, 256] - - [374, 9138.43] + - [469, 9138.43] - - [256, 128, 1, 256] - - [346, 1205.36] + - [441, 1205.36] - - [1024, 1856, 1, 256] - - [385, 6375.09] + - [480, 6375.09] - - [4288, 64, 1, 128] - - [302, 1695.43] + - [397, 1695.43] - - [256, 448, 1, 3328] - - [349, 5659.67] + - [444, 5659.67] - - [1408, 6784, 1, 1280] - - [374, 9349.2] + - [469, 9349.2] - - [3584, 3584, 1, 1280] - - [379, 9302.19] + - [474, 9302.19] - - [64, 2368, 1, 1280] - - [348, 4433.07] + - [443, 4433.07] - - [448, 2368, 1, 1280] - - [374, 7250.77] + - [469, 7250.77] - - [5888, 5888, 1, 128] - - [366, 4616.03] + - [461, 4616.03] - - [64, 6784, 1, 3328] - - [380, 6987.23] + - [475, 6987.23] - - [2944, 256, 1, 1280] - - [383, 6127.45] + - [478, 6127.45] - - [5056, 5888, 1, 128] - - [365, 5106.39] + - [460, 5106.39] - - [256, 2368, 1, 128] - - [366, 2141.23] + - [461, 2141.23] - - [5056, 2368, 1, 3328] - - [377, 9041.75] + - [472, 9041.75] - - [2944, 4288, 1, 256] - - [385, 8691.22] + - [480, 8691.22] - - [1408, 3584, 1, 1280] - - [374, 9070.0] + - [469, 9070.0] - - [2368, 64, 1, 256] - - [346, 2412.87] + - [441, 2412.87] - - [64, 448, 1, 3328] - - [361, 3739.14] + - [456, 3739.14] - - [256, 256, 1, 3328] - - [348, 5304.18] + - [443, 5304.18] - - [5888, 4, 1, 128] - - [387, 105.655] + - [482, 105.655] - - [1856, 704, 1, 256] - - [374, 8025.43] + - [469, 8025.43] - - [4, 4288, 1, 1280] - - [321, 579.07] + - [416, 579.07] - - [1408, 448, 1, 3328] - - [382, 5714.51] + - [477, 5714.51] - - [1024, 4, 1, 3328] - - [340, 608.649] + - [435, 608.649] - - [2368, 256, 1, 256] - - [380, 5173.08] + - [475, 5173.08] - - [2368, 6784, 1, 3328] - - [380, 9456.61] + - [475, 9456.61] - - [1856, 1408, 1, 1280] - - [385, 7805.19] + - [480, 7805.19] - - [1856, 448, 1, 1280] - - [372, 6185.04] + - [467, 6185.04] - - [6784, 704, 1, 128] - - [366, 4597.87] + - [461, 4597.87] - - [4, 4, 1, 256] - - [343, 0.791892] + - [438, 0.791892] - - [128, 5888, 1, 128] - - [304, 2691.76] + - [399, 2691.76] - - [1408, 5888, 1, 256] - - [379, 7164.27] + - [474, 7164.27] - - [704, 2944, 1, 1280] - - [381, 8139.81] + - [476, 8139.81] - - [1856, 2368, 1, 128] - - [369, 4623.38] + - [464, 4623.38] - - [4096, 7133, 1, 4096] - - [373, 9940.07] + - [468, 9940.07] - - [256, 64, 1, 256] - - [337, 689.953] + - [432, 689.953] - - [1024, 1024, 1, 256] - - [380, 7216.11] + - [475, 7216.11] - - [704, 1856, 1, 256] - - [380, 6364.17] + - [475, 6364.17] - - [128, 4288, 1, 3328] - - [336, 7200.59] + - [431, 7200.59] - - [3584, 704, 1, 1280] - - [383, 7972.08] + - [478, 7972.08] - - [256, 128, 1, 1280] - - [334, 2702.62] + - [429, 2702.62] - - [2368, 4, 1, 256] - - [329, 326.018] + - [424, 326.018] - - [256, 2368, 1, 1280] - - [374, 6638.93] + - [469, 6638.93] - - [2944, 6784, 1, 128] - - [365, 5233.53] + - [460, 5233.53] - - [3584, 448, 1, 3328] - - [374, 8094.4] + - [469, 8094.4] - - [1408, 4, 1, 256] - - [392, 243.646] + - [487, 243.646] - - [704, 2368, 1, 3328] - - [374, 8403.11] + - [469, 8403.11] - - [2944, 448, 1, 256] - - [374, 7022.59] + - [469, 7022.59] - - [1856, 448, 1, 128] - - [369, 2842.79] + - [464, 2842.79] - - [2368, 128, 1, 1280] - - [356, 5685.52] + - [451, 5685.52] - - [256, 5888, 1, 128] - - [371, 2178.71] + - [466, 2178.71] - - [64, 6784, 1, 256] - - [374, 5385.23] + - [469, 5385.23] - - [64, 5056, 1, 1280] - - [348, 5603.29] + - [443, 5603.29] - - [4, 6784, 1, 128] - - [386, 180.256] + - [481, 180.256] - - [2944, 2944, 1, 1280] - - [383, 9129.39] + - [478, 9129.39] - - [5888, 2368, 1, 256] - - [385, 6961.69] + - [480, 6961.69] - - [4, 3584, 1, 1280] - - [329, 646.23] + - [424, 646.23] - - [1408, 128, 1, 128] - - [315, 1172.29] + - [410, 1172.29] - - [6784, 704, 1, 3328] - - [380, 9084.62] + - [475, 9084.62] - - [128, 64, 1, 1280] - - [359, 1260.41] + - [454, 1260.41] - - [2368, 256, 1, 1280] - - [380, 6643.48] + - [475, 6643.48] - - [4, 448, 1, 3328] - - [343, 433.514] + - [438, 433.514] - - [5888, 4288, 1, 128] - - [367, 4753.17] + - [462, 4753.17] - - [4, 5888, 1, 256] - - [329, 471.14] + - [424, 471.14] - - [1408, 2944, 1, 3328] - - [383, 9207.1] + - [478, 9207.1] - - [3584, 704, 1, 128] - - [369, 3762.46] + - [464, 3762.46] - - [64, 1024, 1, 256] - - [347, 1807.99] + - [442, 1807.99] - - [5056, 5056, 1, 128] - - [370, 4830.16] + - [465, 4830.16] - - [2368, 448, 1, 1280] - - [374, 7263.16] + - [469, 7263.16] - - [128, 3584, 1, 256] - - [377, 4369.17] + - [472, 4369.17] - - [704, 448, 1, 1280] - - [375, 4205.33] + - [470, 4205.33] - - [448, 5056, 1, 128] - - [366, 3855.57] + - [461, 3855.57] - - [256, 4, 1, 1280] - - [397, 157.638] + - [492, 157.638] - - [128, 5056, 1, 256] - - [380, 6109.06] + - [475, 6109.06] - - [1408, 5056, 1, 128] - - [369, 4836.68] + - [464, 4836.68] - - [2944, 3584, 1, 128] - - [369, 4532.19] + - [464, 4532.19] - - [3584, 2368, 1, 256] - - [374, 8951.34] + - [469, 8951.34] - - [5888, 5056, 1, 1280] - - [385, 9276.49] + - [480, 9276.49] - - [2368, 5056, 1, 128] - - [369, 5167.66] + - [464, 5167.66] - - [64, 704, 1, 256] - - [329, 1501.97] + - [424, 1501.97] - - [4288, 256, 1, 1280] - - [374, 7496.3] + - [469, 7496.3] - - [3584, 3584, 1, 3328] - - [375, 9301.77] + - [470, 9301.77] - - [1024, 256, 1, 128] - - [366, 1508.84] + - [461, 1508.84] - - [4, 704, 1, 128] - - [387, 12.1469] + - [482, 12.1469] - - [5888, 6784, 1, 256] - - [373, 9370.47] + - [468, 9370.47] - - [4288, 2944, 1, 3328] - - [377, 9149.09] + - [472, 9149.09] - - [2944, 64, 1, 128] - - [313, 1456.46] + - [408, 1456.46] - - [1856, 64, 1, 256] - - [339, 2210.03] + - [434, 2210.03] - - [4288, 128, 1, 3328] - - [333, 6471.95] + - [428, 6471.95] - - [4288, 704, 1, 1280] - - [380, 8934.61] + - [475, 8934.61] - - [256, 5056, 1, 1280] - - [374, 8439.13] + - [469, 8439.13] - - [1408, 256, 1, 128] - - [369, 1769.17] + - [464, 1769.17] - - [2944, 5888, 1, 3328] - - [374, 9448.04] + - [469, 9448.04] - - [6784, 5888, 1, 1280] - - [385, 9372.25] + - [480, 9372.25] - - [704, 128, 1, 256] - - [331, 2059.8] + - [426, 2059.8] - - [5888, 4288, 1, 1280] - - [377, 9244.32] + - [472, 9244.32] - - [448, 256, 1, 1280] - - [356, 4741.72] + - [451, 4741.72] - - [5888, 3584, 1, 128] - - [365, 4980.06] + - [460, 4980.06] - - [1856, 1856, 1, 128] - - [369, 4363.98] + - [464, 4363.98] - - [5056, 4, 1, 1280] - - [389, 629.641] + - [484, 629.641] - - [256, 1408, 1, 1280] - - [380, 5588.44] + - [475, 5588.44] - - [512, 16, 1, 512] - - [340, 689.953] + - [435, 689.953] - - [704, 3584, 1, 128] - - [369, 4069.67] + - [464, 4069.67] - - [5888, 448, 1, 3328] - - [385, 7925.94] + - [480, 7925.94] - - [2368, 4288, 1, 1280] - - [384, 8492.7] + - [479, 8492.7] - - [4288, 2944, 1, 128] - - [366, 5238.21] + - [461, 5238.21] - - [1024, 6784, 1, 3328] - - [380, 8578.18] + - [475, 8578.18] - - [128, 2368, 1, 256] - - [380, 3788.9] + - [475, 3788.9] - - [6784, 64, 1, 3328] - - [374, 7003.46] + - [469, 7003.46] - - [5056, 2944, 1, 3328] - - [377, 8575.45] + - [472, 8575.45] - - [448, 128, 1, 256] - - [329, 1715.06] + - [424, 1715.06] - - [2944, 3584, 1, 256] - - [374, 8994.26] + - [469, 8994.26] - - [1408, 1408, 1, 3328] - - [372, 8757.7] + - [467, 8757.7] - - [1856, 128, 1, 1280] - - [374, 5598.17] + - [469, 5598.17] - - [3584, 3584, 1, 128] - - [365, 4787.44] + - [460, 4787.44] - - [64, 3584, 1, 256] - - [380, 3546.01] + - [475, 3546.01] - - [1408, 4, 1, 3328] - - [324, 640.24] + - [419, 640.24] - - [128, 2944, 1, 3328] - - [348, 7204.24] + - [443, 7204.24] - - [3584, 704, 1, 256] - - [374, 6239.69] + - [469, 6239.69] - - [2944, 448, 1, 3328] - - [380, 7726.71] + - [475, 7726.71] - - [3584, 1408, 1, 3328] - - [372, 9358.78] + - [467, 9358.78] - - [704, 3584, 1, 1280] - - [380, 8005.28] + - [475, 8005.28] - - [2944, 6784, 1, 1280] - - [372, 9487.73] + - [467, 9487.73] - - [1856, 6784, 1, 256] - - [374, 5684.56] + - [469, 5684.56] - - [4288, 448, 1, 3328] - - [380, 8410.38] + - [475, 8410.38] - - [6784, 4288, 1, 128] - - [370, 4785.58] + - [465, 4785.58] - - [6784, 704, 1, 1280] - - [374, 5579.05] + - [469, 5579.05] - - [256, 4288, 1, 256] - - [374, 6781.43] + - [469, 6781.43] - - [3584, 64, 1, 128] - - [313, 1474.0] + - [408, 1474.0] - - [5888, 1024, 1, 3328] - - [372, 8639.49] + - [467, 8639.49] - - [448, 64, 1, 128] - - [304, 259.282] + - [399, 259.282] - - [704, 6784, 1, 1280] - - [380, 9027.25] + - [475, 9027.25] - - [5888, 128, 1, 256] - - [380, 6812.88] + - [475, 6812.88] - - [2368, 448, 1, 3328] - - [380, 7356.63] + - [475, 7356.63] - - [1856, 5056, 1, 3328] - - [379, 8871.56] + - [474, 8871.56] - - [4, 6784, 1, 256] - - [388, 469.479] + - [483, 469.479] - - [1024, 3584, 1, 128] - - [366, 3428.02] + - [461, 3428.02] - - [1024, 1408, 1, 128] - - [369, 2935.05] + - [464, 2935.05] - - [2368, 2944, 1, 128] - - [369, 4888.02] + - [464, 4888.02] - - [5056, 64, 1, 256] - - [338, 3186.16] + - [433, 3186.16] - - [4, 448, 1, 1280] - - [343, 273.167] + - [438, 273.167] - - [5056, 2944, 1, 128] - - [370, 4752.79] + - [465, 4752.79] - - [5888, 5056, 1, 3328] - - [384, 9124.77] + - [479, 9124.77] - - [1024, 704, 1, 128] - - [369, 2302.36] + - [464, 2302.36] - - [1408, 2368, 1, 128] - - [369, 3826.95] + - [464, 3826.95] - - [5888, 2368, 1, 128] - - [366, 4912.77] + - [461, 4912.77] - - [128, 5056, 1, 3328] - - [356, 7583.8] + - [451, 7583.8] - - [3584, 6784, 1, 1280] - - [383, 9313.5] + - [478, 9313.5] - - [3072, 7435, 1, 1024] - - [377, 9322.07] + - [472, 9322.07] - - [1856, 5888, 1, 256] - - [374, 5778.34] + - [469, 5778.34] - - [256, 256, 1, 256] - - [326, 1576.91] + - [421, 1576.91] - - [256, 64, 1, 128] - - [312, 173.705] + - [407, 173.705] - - [4288, 4288, 1, 3328] - - [379, 8416.27] + - [474, 8416.27] - - [4288, 1408, 1, 1280] - - [385, 9301.97] + - [480, 9301.97] - - [3584, 5056, 1, 128] - - [371, 4344.94] + - [466, 4344.94] - - [4, 1024, 1, 3328] - - [340, 615.239] + - [435, 615.239] - - [4288, 2368, 1, 256] - - [374, 9142.67] + - [469, 9142.67] - - [2944, 5056, 1, 1280] - - [374, 9399.69] + - [469, 9399.69] - - [448, 6784, 1, 256] - - [373, 5710.93] + - [468, 5710.93] - - [64, 1024, 1, 3328] - - [356, 4975.1] + - [451, 4975.1] - - [6784, 2368, 1, 3328] - - [383, 9207.63] + - [478, 9207.63] - - [256, 1024, 1, 1280] - - [380, 5983.42] + - [475, 5983.42] - - [704, 4, 1, 128] - - [386, 15.1187] + - [481, 15.1187] - - [256, 4, 1, 256] - - [343, 52.9516] + - [438, 52.9516] - - [4288, 128, 1, 256] - - [374, 5242.98] + - [469, 5242.98] - - [4288, 1856, 1, 3328] - - [385, 9354.06] + - [480, 9354.06] - - [3584, 448, 1, 128] - - [366, 3353.9] + - [461, 3353.9] - - [256, 4, 1, 3328] - - [397, 313.324] + - [492, 313.324] - - [4, 1408, 1, 1280] - - [340, 509.207] + - [435, 509.207] - - [3584, 64, 1, 1280] - - [328, 5198.42] + - [423, 5198.42] - - [1408, 448, 1, 128] - - [366, 2628.37] + - [461, 2628.37] - - [3584, 1024, 1, 1280] - - [380, 8535.01] + - [475, 8535.01] - - [1856, 5056, 1, 256] - - [372, 8184.49] + - [467, 8184.49] - - [4, 3584, 1, 256] - - [390, 395.576] + - [485, 395.576] - - [1024, 4288, 1, 256] - - [375, 5966.52] + - [470, 5966.52] - - [5888, 3584, 1, 3328] - - [378, 9189.43] + - [473, 9189.43] - - [4, 256, 1, 256] - - [394, 41.5785] + - [489, 41.5785] - - [5056, 3584, 1, 3328] - - [379, 9431.92] + - [474, 9431.92] - - [128, 5888, 1, 1280] - - [374, 8192.1] + - [469, 8192.1] - - [704, 448, 1, 128] - - [366, 1510.96] + - [461, 1510.96] - - [2368, 1408, 1, 1280] - - [374, 8415.65] + - [469, 8415.65] - - [5056, 2944, 1, 1280] - - [385, 9294.77] + - [480, 9294.77] - - [4, 4, 1, 128] - - [387, 0.1356549] + - [482, 0.1356549] - - [3584, 256, 1, 256] - - [374, 6749.55] + - [469, 6749.55] - - [128, 1856, 1, 3328] - - [327, 6797.09] + - [422, 6797.09] - - [1024, 6784, 1, 256] - - [380, 8783.09] + - [475, 8783.09] - - [4, 128, 1, 256] - - [340, 27.4067] + - [435, 27.4067] - - [64, 64, 1, 1280] - - [359, 712.448] + - [454, 712.448] - - [6784, 4, 1, 128] - - [387, 122.06] + - [482, 122.06] - - [2944, 1408, 1, 128] - - [369, 4430.46] + - [464, 4430.46] - - [448, 128, 1, 3328] - - [356, 5097.34] + - [451, 5097.34] - - [64, 2944, 1, 3328] - - [356, 6362.2] + - [451, 6362.2] - - [64, 4288, 1, 3328] - - [356, 6565.01] + - [451, 6565.01] - - [5056, 6784, 1, 3328] - - [380, 8121.18] + - [475, 8121.18] - - [128, 2944, 1, 256] - - [374, 4692.17] + - [469, 4692.17] - - [128, 6784, 1, 128] - - [303, 2687.46] + - [398, 2687.46] - - [3584, 4288, 1, 256] - - [380, 9193.99] + - [475, 9193.99] - - [448, 1856, 1, 256] - - [380, 6231.39] + - [475, 6231.39] - - [1856, 6784, 1, 3328] - - [385, 9191.48] + - [480, 9191.48] - - [3584, 128, 1, 3328] - - [374, 7368.47] + - [469, 7368.47] - - [64, 1856, 1, 256] - - [325, 2184.63] + - [420, 2184.63] - - [1024, 448, 1, 1280] - - [380, 6977.32] + - [475, 6977.32] - - [5888, 4288, 1, 256] - - [380, 5780.5] + - [475, 5780.5] - - [4, 448, 1, 128] - - [387, 9.06] + - [482, 9.06] - - [5056, 1408, 1, 256] - - [374, 5601.35] + - [469, 5601.35] - - [64, 256, 1, 1280] - - [340, 1927.63] + - [435, 1927.63] - - [3584, 1024, 1, 256] - - [385, 7542.84] + - [480, 7542.84] - - [256, 704, 1, 256] - - [374, 2957.62] + - [469, 2957.62] - - [5888, 5888, 1, 256] - - [385, 7344.14] + - [480, 7344.14] - - [4288, 1024, 1, 1280] - - [380, 8925.84] + - [475, 8925.84] - - [5888, 128, 1, 3328] - - [374, 8410.07] + - [469, 8410.07] - - [448, 6784, 1, 3328] - - [374, 8862.56] + - [469, 8862.56] - - [2944, 1408, 1, 1280] - - [385, 7478.93] + - [480, 7478.93] - - [1024, 32, 1, 512] - - [329, 1777.35] + - [424, 1777.35] - - [2944, 1856, 1, 3328] - - [374, 9153.43] + - [469, 9153.43] - - [2368, 64, 1, 128] - - [313, 1102.3] + - [408, 1102.3] - - [2944, 2944, 1, 128] - - [365, 4591.95] + - [460, 4591.95] - - [4, 128, 1, 3328] - - [395, 119.09] + - [490, 119.09] - - [3584, 5888, 1, 1280] - - [374, 9222.49] + - [469, 9222.49] - - [64, 4, 1, 128] - - [386, 1.03516] + - [481, 1.03516] - - [6784, 1856, 1, 1280] - - [374, 9136.07] + - [469, 9136.07] - - [2944, 5056, 1, 256] - - [380, 8860.13] + - [475, 8860.13] - - [2944, 5888, 1, 1280] - - [373, 9643.63] + - [468, 9643.63] - - [5888, 256, 1, 3328] - - [380, 8799.53] + - [475, 8799.53] - - [1856, 5888, 1, 3328] - - [380, 9457.53] + - [475, 9457.53] - - [3584, 1408, 1, 256] - - [380, 8672.53] + - [475, 8672.53] - - [704, 3584, 1, 3328] - - [380, 8525.3] + - [475, 8525.3] - - [5056, 448, 1, 1280] - - [380, 8843.77] + - [475, 8843.77] - - [3584, 1856, 1, 3328] - - [372, 8881.53] + - [467, 8881.53] - - [64, 1408, 1, 128] - - [301, 747.142] + - [396, 747.142] - - [1408, 704, 1, 1280] - - [374, 8342.93] + - [469, 8342.93] - - [2944, 1024, 1, 256] - - [385, 8079.58] + - [480, 8079.58] - - [1024, 2368, 1, 128] - - [369, 3347.58] + - [464, 3347.58] - - [2368, 4288, 1, 3328] - - [380, 9467.67] + - [475, 9467.67] - - [4, 1408, 1, 256] - - [392, 257.563] + - [487, 257.563] - - [1024, 1408, 1, 1280] - - [380, 8241.84] + - [475, 8241.84] - - [64, 64, 1, 256] - - [340, 190.059] + - [435, 190.059] - - [704, 256, 1, 3328] - - [374, 4519.28] + - [469, 4519.28] - - [6784, 5056, 1, 256] - - [373, 9133.78] + - [468, 9133.78] - - [4, 4288, 1, 3328] - - [324, 670.075] + - [419, 670.075] - - [448, 6784, 1, 128] - - [366, 4481.92] + - [461, 4481.92] - - [4, 704, 1, 3328] - - [396, 523.071] + - [491, 523.071] - - [448, 2944, 1, 256] - - [374, 7022.59] + - [469, 7022.59] - - [2944, 6784, 1, 256] - - [380, 9199.84] + - [475, 9199.84] - - [2368, 2368, 1, 1280] - - [385, 8646.84] + - [480, 8646.84] - - [4, 4, 1, 1280] - - [343, 3.11176] + - [438, 3.11176] - - [1856, 3584, 1, 1280] - - [372, 8805.45] + - [467, 8805.45] - - [64, 2944, 1, 256] - - [346, 2565.76] + - [441, 2565.76] - - [3584, 1408, 1, 1280] - - [385, 9273.12] + - [480, 9273.12] - - [448, 256, 1, 128] - - [301, 941.13] + - [396, 941.13] - - [4288, 448, 1, 128] - - [367, 3215.2] + - [462, 3215.2] - - [5056, 256, 1, 1280] - - [380, 8790.13] + - [475, 8790.13] - - [1856, 1408, 1, 3328] - - [374, 9310.73] + - [469, 9310.73] - - [128, 128, 1, 128] - - [309, 155.215] + - [404, 155.215] - - [1024, 4288, 1, 3328] - - [377, 8528.12] + - [472, 8528.12] - - [448, 2368, 1, 256] - - [381, 5097.34] + - [476, 5097.34] - - [1024, 4, 1, 128] - - [387, 10.3721] + - [482, 10.3721] - - [5056, 448, 1, 256] - - [380, 8236.78] + - [475, 8236.78] - - [2944, 2368, 1, 3328] - - [373, 9331.16] + - [468, 9331.16] - - [704, 128, 1, 3328] - - [348, 5969.3] + - [443, 5969.3] - - [64, 64, 1, 3328] - - [364, 1494.78] + - [459, 1494.78] - - [1024, 1856, 1, 1280] - - [379, 6356.43] + - [474, 6356.43] - - [6784, 1856, 1, 256] - - [380, 9068.63] + - [475, 9068.63] - - [128, 2368, 1, 3328] - - [356, 6714.22] + - [451, 6714.22] - - [1024, 5888, 1, 256] - - [380, 5501.6] + - [475, 5501.6] - - [5056, 128, 1, 1280] - - [336, 6455.64] + - [431, 6455.64] - - [5056, 64, 1, 3328] - - [341, 6703.81] + - [436, 6703.81] - - [128, 704, 1, 128] - - [302, 696.618] + - [397, 696.618] - - [1408, 2368, 1, 256] - - [374, 8667.25] + - [469, 8667.25] - - [1408, 1408, 1, 256] - - [385, 7615.81] + - [480, 7615.81] - - [4, 64, 1, 128] - - [387, 1.08463] + - [482, 1.08463] - - [64, 128, 1, 1280] - - [359, 1379.81] + - [454, 1379.81] - - [2368, 2368, 1, 128] - - [369, 4582.26] + - [464, 4582.26] - - [64, 5888, 1, 128] - - [302, 2086.37] + - [397, 2086.37] - - [5888, 4, 1, 3328] - - [323, 667.514] + - [418, 667.514] - - [6784, 1408, 1, 128] - - [370, 4516.34] + - [465, 4516.34] - - [4288, 5888, 1, 256] - - [385, 8497.43] + - [480, 8497.43] - - [1408, 5056, 1, 256] - - [374, 8867.46] + - [469, 8867.46] - - [5056, 128, 1, 3328] - - [356, 7678.98] + - [451, 7678.98] - - [128, 128, 1, 1280] - - [344, 2016.59] + - [439, 2016.59] - - [448, 704, 1, 256] - - [375, 3030.89] + - [470, 3030.89] - - [4288, 3584, 1, 128] - - [366, 5246.33] + - [461, 5246.33] - - [2944, 128, 1, 3328] - - [341, 6795.16] + - [436, 6795.16] - - [128, 5056, 1, 1280] - - [327, 6193.09] + - [422, 6193.09] - - [3584, 5056, 1, 1280] - - [379, 9499.17] + - [474, 9499.17] - - [256, 448, 1, 1280] - - [335, 4267.56] + - [430, 4267.56] - - [704, 704, 1, 128] - - [369, 2259.32] + - [464, 2259.32] - - [5056, 4, 1, 128] - - [387, 12.5313] + - [482, 12.5313] - - [704, 256, 1, 1280] - - [374, 4355.97] + - [469, 4355.97] - - [64, 2368, 1, 3328] - - [348, 6310.97] + - [443, 6310.97] - - [1856, 1024, 1, 128] - - [365, 4065.43] + - [460, 4065.43] - - [1856, 64, 1, 128] - - [304, 936.329] + - [399, 936.329] - - [64, 6784, 1, 1280] - - [327, 5731.8] + - [422, 5731.8] - - [704, 4288, 1, 256] - - [380, 5218.9] + - [475, 5218.9] - - [5888, 2368, 1, 1280] - - [374, 9378.9] + - [469, 9378.9] - - [128, 256, 1, 256] - - [344, 1219.37] + - [439, 1219.37] - - [256, 64, 1, 1280] - - [346, 1820.54] + - [441, 1820.54] - - [2368, 5888, 1, 1280] - - [385, 9143.64] + - [480, 9143.64] - - [5888, 256, 1, 1280] - - [374, 8678.47] + - [469, 8678.47] - - [4, 5888, 1, 1280] - - [321, 668.242] + - [416, 668.242] - - [704, 128, 1, 128] - - [309, 649.556] + - [404, 649.556] - - [1024, 4, 1, 1280] - - [340, 478.465] + - [435, 478.465] - - [2368, 1856, 1, 3328] - - [372, 8153.87] + - [467, 8153.87] - - [2368, 128, 1, 128] - - [307, 1858.21] + - [402, 1858.21] - - [2944, 704, 1, 256] - - [374, 8438.07] + - [469, 8438.07] - - [5056, 128, 1, 128] - - [303, 2689.63] + - [398, 2689.63] - - [256, 704, 1, 3328] - - [374, 4541.18] + - [469, 4541.18] - - [704, 3584, 1, 256] - - [375, 7771.07] + - [470, 7771.07] - - [1024, 1024, 1, 1024] - - [380, 8305.62] + - [475, 8305.62] - - [704, 2944, 1, 3328] - - [380, 9166.48] + - [475, 9166.48] - - [6784, 1024, 1, 128] - - [365, 4362.31] + - [460, 4362.31] - - [256, 448, 1, 128] - - [312, 899.614] + - [407, 899.614] - - [448, 1024, 1, 3328] - - [374, 7385.56] + - [469, 7385.56] - - [2944, 1024, 1, 3328] - - [377, 8779.81] + - [472, 8779.81] - - [2944, 5056, 1, 128] - - [369, 5103.11] + - [464, 5103.11] - - [1408, 6784, 1, 256] - - [380, 8346.89] + - [475, 8346.89] - - [6784, 1408, 1, 3328] - - [376, 8878.4] + - [471, 8878.4] - - [4288, 6784, 1, 128] - - [365, 5432.99] + - [460, 5432.99] - - [704, 64, 1, 256] - - [354, 1441.89] + - [449, 1441.89] - - [5888, 4, 1, 1280] - - [391, 636.641] + - [486, 636.641] - - [256, 2368, 1, 3328] - - [374, 6804.8] + - [469, 6804.8] - - [6784, 2944, 1, 1280] - - [373, 9472.26] + - [468, 9472.26] - - [4288, 1856, 1, 128] - - [369, 4886.38] + - [464, 4886.38] - - [1856, 2944, 1, 128] - - [366, 4642.96] + - [461, 4642.96] - - [6784, 448, 1, 128] - - [366, 4369.17] + - [461, 4369.17] - - [64, 3584, 1, 128] - - [313, 1645.85] + - [408, 1645.85] - - [448, 5056, 1, 1280] - - [374, 8553.64] + - [469, 8553.64] - - [2368, 1856, 1, 128] - - [366, 4741.85] + - [461, 4741.85] - - [128, 448, 1, 1280] - - [356, 3745.01] + - [451, 3745.01] - - [4288, 704, 1, 256] - - [374, 8444.16] + - [469, 8444.16] - - [256, 3584, 1, 128] - - [366, 2454.96] + - [461, 2454.96] - - [5888, 704, 1, 256] - - [374, 8819.57] + - [469, 8819.57] - - [3584, 1024, 1, 128] - - [369, 4094.96] + - [464, 4094.96] - - [256, 5888, 1, 3328] - - [383, 8538.33] + - [478, 8538.33] - - [1408, 4288, 1, 3328] - - [385, 9212.57] + - [480, 9212.57] - - [6784, 4288, 1, 256] - - [373, 9163.12] + - [468, 9163.12] - - [4288, 256, 1, 128] - - [366, 3081.44] + - [461, 3081.44] - - [5888, 256, 1, 256] - - [374, 7680.75] + - [469, 7680.75] - - [6784, 1024, 1, 1280] - - [385, 9248.63] + - [480, 9248.63] - - [5888, 1024, 1, 128] - - [369, 4061.94] + - [464, 4061.94] - - [1024, 128, 1, 256] - - [380, 2317.39] + - [475, 2317.39] - - [128, 64, 1, 3328] - - [363, 2116.79] + - [458, 2116.79] - - [448, 64, 1, 256] - - [346, 1079.52] + - [441, 1079.52] - - [2368, 256, 1, 128] - - [367, 2229.83] + - [462, 2229.83] - - [6784, 3584, 1, 1280] - - [380, 9096.6] + - [475, 9096.6] - - [1024, 6784, 1, 1280] - - [378, 9112.9] + - [473, 9112.9] - - [2944, 64, 1, 1280] - - [336, 4983.0] + - [431, 4983.0] - - [1408, 2944, 1, 1280] - - [375, 9131.63] + - [470, 9131.63] - - [256, 1856, 1, 256] - - [383, 4432.86] + - [478, 4432.86] - - [1408, 2368, 1, 3328] - - [383, 8449.18] + - [478, 8449.18] - - [2944, 4, 1, 3328] - - [329, 673.94] + - [424, 673.94] - - [128, 1408, 1, 3328] - - [348, 6582.47] + - [443, 6582.47] - - [2944, 1856, 1, 128] - - [366, 4827.54] + - [461, 4827.54] - - [256, 2944, 1, 128] - - [369, 2416.66] + - [464, 2416.66] - - [256, 6784, 1, 128] - - [369, 3118.76] + - [464, 3118.76] - - [2368, 4, 1, 128] - - [387, 22.7197] + - [482, 22.7197] - - [1408, 256, 1, 3328] - - [374, 3733.82] + - [469, 3733.82] - - [1856, 4, 1, 128] - - [386, 7.20009] + - [481, 7.20009] - - [1024, 16, 1, 512] - - [342, 1165.18] + - [437, 1165.18] - - [5056, 6784, 1, 128] - - [370, 4949.13] + - [465, 4949.13] - - [4288, 5056, 1, 128] - - [369, 4966.9] + - [464, 4966.9] - - [1856, 5888, 1, 128] - - [365, 4351.76] + - [460, 4351.76] - - [2944, 5888, 1, 256] - - [385, 8460.99] + - [480, 8460.99] - - [3584, 1856, 1, 256] - - [380, 8876.7] + - [475, 8876.7] - - [4288, 3584, 1, 1280] - - [373, 9603.7] + - [468, 9603.7] - - [2368, 448, 1, 256] - - [374, 6604.7] + - [469, 6604.7] - - [4288, 256, 1, 3328] - - [374, 7619.89] + - [469, 7619.89] - - [1856, 704, 1, 128] - - [366, 3629.61] + - [461, 3629.61] - - [1408, 64, 1, 256] - - [330, 2168.21] + - [425, 2168.21] - - [64, 1856, 1, 128] - - [306, 979.762] + - [401, 979.762] - - [4, 256, 1, 128] - - [387, 5.23595] + - [482, 5.23595] - - [704, 4288, 1, 3328] - - [380, 9014.52] + - [475, 9014.52] - - [704, 5888, 1, 128] - - [367, 4221.77] + - [462, 4221.77] - - [6784, 3584, 1, 128] - - [365, 5360.73] + - [460, 5360.73] - - [1024, 64, 1, 256] - - [325, 1588.85] + - [420, 1588.85] - - [64, 2368, 1, 256] - - [380, 2552.55] + - [475, 2552.55] - - [4288, 5056, 1, 3328] - - [379, 8193.38] + - [474, 8193.38] - - [4, 1856, 1, 1280] - - [329, 499.192] + - [424, 499.192] - - [4288, 128, 1, 128] - - [366, 2373.57] + - [461, 2373.57] - - [1408, 1408, 1, 128] - - [369, 3753.88] + - [464, 3753.88] - - [1024, 128, 1, 3328] - - [351, 5656.32] + - [446, 5656.32] - - [1856, 128, 1, 128] - - [302, 1617.58] + - [397, 1617.58] - - [5056, 2368, 1, 256] - - [385, 5553.41] + - [480, 5553.41] - - [4288, 704, 1, 3328] - - [373, 6962.06] + - [468, 6962.06] - - [448, 3584, 1, 256] - - [383, 5981.5] + - [478, 5981.5] - - [64, 128, 1, 128] - - [320, 74.9983] + - [415, 74.9983] - - [2368, 64, 1, 1280] - - [356, 5041.33] + - [451, 5041.33] - - [2368, 1024, 1, 1280] - - [381, 7740.97] + - [476, 7740.97] - - [2944, 1408, 1, 3328] - - [383, 9204.65] + - [478, 9204.65] - - [1408, 448, 1, 256] - - [380, 5954.4] + - [475, 5954.4] - - [1024, 1408, 1, 3328] - - [377, 8161.54] + - [472, 8161.54] - - [2560, 7133, 1, 2560] - - [372, 9636.69] + - [467, 9636.69] - - [1408, 4, 1, 1280] - - [324, 520.979] + - [419, 520.979] - - [5888, 3584, 1, 256] - - [385, 9225.26] + - [480, 9225.26] - - [128, 1024, 1, 1280] - - [327, 4755.55] + - [422, 4755.55] - - [1408, 1856, 1, 3328] - - [377, 9130.87] + - [472, 9130.87] - - [4, 4, 1, 3328] - - [397, 7.03333] + - [492, 7.03333] - - [6784, 1408, 1, 1280] - - [374, 9346.91] + - [469, 9346.91] - - [4, 1024, 1, 1280] - - [324, 422.913] + - [419, 422.913] - - [704, 2944, 1, 256] - - [380, 8332.06] + - [475, 8332.06] - - [704, 4288, 1, 128] - - [366, 4371.14] + - [461, 4371.14] - - [2368, 4288, 1, 128] - - [366, 3988.89] + - [461, 3988.89] - - [64, 4288, 1, 1280] - - [356, 5407.63] + - [451, 5407.63] - - [6784, 64, 1, 1280] - - [336, 5708.25] + - [431, 5708.25] - - [3584, 128, 1, 128] - - [302, 2463.2] + - [397, 2463.2] - - [1024, 6784, 1, 128] - - [367, 3862.12] + - [462, 3862.12] - - [4, 1856, 1, 128] - - [387, 30.6362] + - [482, 30.6362] - - [1408, 64, 1, 3328] - - [356, 6095.48] + - [451, 6095.48] - - [6784, 4, 1, 256] - - [389, 487.938] + - [484, 487.938] - - [1408, 1408, 1, 1280] - - [385, 8640.63] + - [480, 8640.63] - - [256, 2368, 1, 256] - - [377, 4282.36] + - [472, 4282.36] - - [448, 4288, 1, 3328] - - [374, 8516.13] + - [469, 8516.13] - - [2368, 1408, 1, 256] - - [380, 8632.19] + - [475, 8632.19] - - [5888, 5056, 1, 128] - - [366, 5091.11] + - [461, 5091.11] - - [704, 2368, 1, 256] - - [380, 7664.8] + - [475, 7664.8] - - [2944, 448, 1, 1280] - - [380, 7618.35] + - [475, 7618.35] - - [5888, 2368, 1, 3328] - - [383, 9343.48] + - [478, 9343.48] - - [64, 2944, 1, 1280] - - [348, 5162.18] + - [443, 5162.18] - - [448, 1856, 1, 1280] - - [374, 7028.0] + - [469, 7028.0] - - [4288, 448, 1, 1280] - - [374, 5855.76] + - [469, 5855.76] - - [5888, 704, 1, 3328] - - [383, 9190.91] + - [478, 9190.91] - - [5056, 256, 1, 128] - - [369, 3235.94] + - [464, 3235.94] - - [1856, 256, 1, 128] - - [367, 1849.78] + - [462, 1849.78] - - [5056, 128, 1, 256] - - [380, 6109.06] + - [475, 6109.06] - - [704, 4, 1, 256] - - [340, 125.256] + - [435, 125.256] - - [1408, 5888, 1, 128] - - [366, 5055.16] + - [461, 5055.16] - - [4288, 4, 1, 128] - - [386, 95.7209] + - [481, 95.7209] - - [1408, 1024, 1, 256] - - [374, 7370.28] + - [469, 7370.28] - - [1024, 1856, 1, 128] - - [366, 2966.8] + - [461, 2966.8] - - [256, 704, 1, 128] - - [368, 528.229] + - [463, 528.229] - - [256, 1024, 1, 128] - - [366, 1171.69] + - [461, 1171.69] - - [448, 1024, 1, 256] - - [380, 5624.65] + - [475, 5624.65] - - [128, 4, 1, 3328] - - [397, 191.985] + - [492, 191.985] - - [5056, 6784, 1, 1280] - - [374, 9544.07] + - [469, 9544.07] - - [704, 5056, 1, 3328] - - [381, 8790.35] + - [476, 8790.35] - - [64, 1408, 1, 1280] - - [348, 4505.7] + - [443, 4505.7] - - [3584, 5056, 1, 3328] - - [379, 9073.52] + - [474, 9073.52] - - [1856, 4, 1, 3328] - - [397, 612.875] + - [492, 612.875] - - [4, 2944, 1, 128] - - [386, 72.0145] + - [481, 72.0145] - - [2368, 2944, 1, 3328] - - [372, 9314.68] + - [467, 9314.68] - - [448, 448, 1, 1280] - - [356, 5129.91] + - [451, 5129.91] - - [2368, 3584, 1, 256] - - [374, 8998.8] + - [469, 8998.8] - - [5056, 3584, 1, 1280] - - [375, 9345.17] + - [470, 9345.17] - - [448, 4, 1, 3328] - - [397, 487.337] + - [492, 487.337] - - [1856, 2944, 1, 1280] - - [385, 8438.79] + - [480, 8438.79] - - [3584, 2368, 1, 1280] - - [380, 9298.9] + - [475, 9298.9] - - [128, 1024, 1, 256] - - [332, 2356.45] + - [427, 2356.45] - - [2944, 1408, 1, 256] - - [372, 5440.82] + - [467, 5440.82] - - [4288, 1408, 1, 3328] - - [372, 9386.09] + - [467, 9386.09] - - [3584, 64, 1, 3328] - - [328, 6310.97] + - [423, 6310.97] - - [1408, 128, 1, 256] - - [374, 2942.53] + - [469, 2942.53] - - [2944, 1024, 1, 128] - - [369, 3927.99] + - [464, 3927.99] - - [4288, 5056, 1, 1280] - - [376, 8328.58] + - [471, 8328.58] - - [5888, 6784, 1, 1280] - - [385, 9757.44] + - [480, 9757.44] - - [6784, 5056, 1, 128] - - [365, 5101.4] + - [460, 5101.4] - - [256, 1024, 1, 3328] - - [374, 6475.87] + - [469, 6475.87] - - [3584, 4, 1, 256] - - [390, 420.973] + - [485, 420.973] - - [1856, 64, 1, 3328] - - [356, 6409.2] + - [451, 6409.2] - - [64, 6784, 1, 128] - - [304, 2387.32] + - [399, 2387.32] - - [5888, 1408, 1, 3328] - - [379, 9655.89] + - [474, 9655.89] - - [5888, 64, 1, 1280] - - [374, 5870.86] + - [469, 5870.86] - - [256, 5056, 1, 256] - - [377, 6109.06] + - [472, 6109.06] - - [128, 3584, 1, 128] - - [307, 2383.23] + - [402, 2383.23] - - [448, 3584, 1, 3328] - - [372, 7092.28] + - [467, 7092.28] - - [704, 2368, 1, 128] - - [366, 3741.08] + - [461, 3741.08] - - [5888, 256, 1, 128] - - [367, 2977.54] + - [462, 2977.54] - - [4, 5056, 1, 128] - - [386, 132.72] + - [481, 132.72] - - [448, 256, 1, 256] - - [338, 2308.29] + - [433, 2308.29] - - [704, 4, 1, 3328] - - [343, 552.674] + - [438, 552.674] - - [1408, 256, 1, 256] - - [374, 4577.22] + - [469, 4577.22] - - [3584, 1856, 1, 128] - - [366, 4571.86] + - [461, 4571.86] - - [4288, 4288, 1, 128] - - [369, 5284.65] + - [464, 5284.65] - - [1856, 1024, 1, 3328] - - [380, 6362.25] + - [475, 6362.25] - - [128, 5888, 1, 3328] - - [350, 7040.83] + - [445, 7040.83] - - [1024, 5056, 1, 256] - - [385, 7855.7] + - [480, 7855.7] - - [2368, 1408, 1, 3328] - - [380, 9205.66] + - [475, 9205.66] - - [5888, 448, 1, 256] - - [377, 5538.84] + - [472, 5538.84] - - [5888, 6784, 1, 128] - - [365, 4500.85] + - [460, 4500.85] - - [2368, 4, 1, 3328] - - [343, 642.898] + - [438, 642.898] - - [6784, 5056, 1, 1280] - - [381, 9249.23] + - [476, 9249.23] - - [5056, 704, 1, 1280] - - [380, 8883.37] + - [475, 8883.37] - - [1408, 256, 1, 1280] - - [374, 5632.1] + - [469, 5632.1] - - [4288, 6784, 1, 1280] - - [380, 8843.31] + - [475, 8843.31] - - [128, 704, 1, 256] - - [338, 2045.19] + - [433, 2045.19] - - [448, 128, 1, 1280] - - [348, 3807.17] + - [443, 3807.17] - - [6784, 4, 1, 3328] - - [391, 684.671] + - [486, 684.671] - - [4288, 4, 1, 1280] - - [340, 601.925] + - [435, 601.925] - - [1024, 64, 1, 3328] - - [352, 3928.48] + - [447, 3928.48] - - [1856, 4, 1, 256] - - [390, 293.394] + - [485, 293.394] - - [64, 3584, 1, 1280] - - [374, 5265.55] + - [469, 5265.55] - - [6784, 1408, 1, 256] - - [374, 9059.36] + - [469, 9059.36] - - [3584, 5888, 1, 128] - - [366, 5084.29] + - [461, 5084.29] - - [5056, 5888, 1, 256] - - [385, 8590.09] + - [480, 8590.09] - - [2368, 1024, 1, 256] - - [377, 4493.13] + - [472, 4493.13] - - [2944, 1856, 1, 256] - - [383, 5202.41] + - [478, 5202.41] - - [1856, 6784, 1, 1280] - - [381, 9071.48] + - [476, 9071.48] - - [64, 5056, 1, 128] - - [304, 2038.42] + - [399, 2038.42] - - [5888, 64, 1, 128] - - [303, 2016.59] + - [398, 2016.59] - - [448, 704, 1, 128] - - [367, 1173.65] + - [462, 1173.65] - - [4, 1024, 1, 128] - - [386, 8.89685] + - [481, 8.89685] - - [4288, 3584, 1, 256] - - [380, 9080.26] + - [475, 9080.26] - - [1408, 704, 1, 128] - - [366, 3165.71] + - [461, 3165.71] - - [64, 256, 1, 3328] - - [360, 3126.59] + - [455, 3126.59] - - [5056, 1856, 1, 1280] - - [377, 8857.55] + - [472, 8857.55] - - [1408, 1024, 1, 3328] - - [383, 8177.12] + - [478, 8177.12] - - [2368, 256, 1, 3328] - - [374, 6810.31] + - [469, 6810.31] - - [5888, 3584, 1, 1280] - - [372, 9535.55] + - [467, 9535.55] - - [1856, 3584, 1, 3328] - - [374, 9281.91] + - [469, 9281.91] - - [5888, 128, 1, 1280] - - [380, 8136.82] + - [475, 8136.82] - - [1024, 2944, 1, 256] - - [372, 7247.96] + - [467, 7247.96] - - [448, 6784, 1, 1280] - - [380, 7014.04] + - [475, 7014.04] - - [256, 3584, 1, 1280] - - [374, 7738.64] + - [469, 7738.64] - - [448, 128, 1, 128] - - [304, 496.048] + - [399, 496.048] - - [704, 5056, 1, 256] - - [380, 8609.44] + - [475, 8609.44] - - [3584, 1024, 1, 3328] - - [373, 7765.73] + - [468, 7765.73] - - [2944, 1856, 1, 1280] - - [385, 7776.03] + - [480, 7776.03] - - [128, 256, 1, 128] - - [317, 296.308] + - [412, 296.308] - - [5056, 256, 1, 256] - - [374, 7829.73] + - [469, 7829.73] - - [2368, 3584, 1, 3328] - - [373, 8896.08] + - [468, 8896.08] - - [2944, 704, 1, 1280] - - [383, 6855.83] + - [478, 6855.83] - - [128, 4, 1, 256] - - [392, 24.9242] + - [487, 24.9242] - - [2944, 3584, 1, 1280] - - [385, 9049.22] + - [480, 9049.22] - - [1856, 5888, 1, 1280] - - [380, 9432.06] + - [475, 9432.06] - - [256, 256, 1, 1280] - - [345, 3942.12] + - [440, 3942.12] - - [5056, 448, 1, 3328] - - [385, 4587.83] + - [480, 4587.83] - - [4288, 1408, 1, 256] - - [385, 5408.83] + - [480, 5408.83] - - [3584, 64, 1, 256] - - [354, 2496.71] + - [449, 2496.71] - - [64, 1856, 1, 3328] - - [327, 5896.78] + - [422, 5896.78] - - [256, 1408, 1, 128] - - [366, 1643.17] + - [461, 1643.17] - - [5888, 1408, 1, 128] - - [365, 4436.37] + - [460, 4436.37] - - [4288, 2368, 1, 1280] - - [374, 9433.04] + - [469, 9433.04] - - [4, 4288, 1, 256] - - [389, 442.732] + - [484, 442.732] - - [256, 4288, 1, 128] - - [366, 2814.79] + - [461, 2814.79] - - [256, 128, 1, 3328] - - [355, 3951.26] + - [450, 3951.26] - - [6784, 2368, 1, 256] - - [374, 9169.99] + - [469, 9169.99] - - [5888, 128, 1, 128] - - [303, 3156.81] + - [398, 3156.81] - - [4288, 1856, 1, 256] - - [380, 5658.23] + - [475, 5658.23] - - [1856, 256, 1, 3328] - - [374, 7646.37] + - [469, 7646.37] - - [1856, 2944, 1, 256] - - [381, 6444.98] + - [476, 6444.98] - - [5056, 1024, 1, 128] - - [365, 4607.3] + - [460, 4607.3] - - [64, 5888, 1, 1280] - - [380, 5842.46] + - [475, 5842.46] - - [1760, 7133, 1, 1760] - - [373, 9097.84] + - [468, 9097.84] - - [6784, 256, 1, 128] - - [366, 3685.41] + - [461, 3685.41] - - [5888, 704, 1, 128] - - [365, 3656.23] + - [460, 3656.23] - - [6784, 64, 1, 128] - - [316, 2191.52] + - [411, 2191.52] - - [1024, 4288, 1, 1280] - - [380, 9199.32] + - [475, 9199.32] - - [2368, 5056, 1, 3328] - - [376, 9072.88] + - [471, 9072.88] - - [448, 4, 1, 128] - - [387, 5.42937] + - [482, 5.42937] - - [4, 256, 1, 3328] - - [397, 311.037] + - [492, 311.037] - - [4288, 1024, 1, 3328] - - [378, 8660.33] + - [473, 8660.33] - - [1024, 5056, 1, 3328] - - [374, 8886.76] + - [469, 8886.76] - - [1024, 1856, 1, 3328] - - [379, 8426.24] + - [474, 8426.24] - - [704, 704, 1, 1280] - - [374, 7661.8] + - [469, 7661.8] - - [128, 2368, 1, 1280] - - [348, 5746.15] + - [443, 5746.15] - - [1408, 128, 1, 3328] - - [356, 6530.87] + - [451, 6530.87] - - [3584, 256, 1, 1280] - - [380, 7634.04] + - [475, 7634.04] - - [4, 128, 1, 128] - - [387, 2.07874] + - [482, 2.07874] - - [704, 6784, 1, 128] - - [369, 4589.59] + - [464, 4589.59] - - [3584, 128, 1, 1280] - - [374, 7078.24] + - [469, 7078.24] - - [4, 256, 1, 1280] - - [343, 178.187] + - [438, 178.187] - - [128, 704, 1, 3328] - - [348, 5959.81] + - [443, 5959.81] - - [4288, 6784, 1, 256] - - [374, 9326.54] + - [469, 9326.54] - - [3584, 2944, 1, 3328] - - [376, 9114.16] + - [471, 9114.16] - - [128, 1856, 1, 256] - - [380, 3672.65] + - [475, 3672.65] - - [64, 4288, 1, 256] - - [374, 3457.51] + - [469, 3457.51] - - [4, 3584, 1, 3328] - - [323, 694.37] + - [418, 694.37] - - [64, 4, 1, 3328] - - [343, 71.5738] + - [438, 71.5738] - - [4, 64, 1, 3328] - - [343, 91.9069] + - [438, 91.9069] - - [5888, 2944, 1, 256] - - [373, 7241.55] + - [468, 7241.55] - - [2368, 6784, 1, 128] - - [369, 5229.63] + - [464, 5229.63] - - [448, 4288, 1, 1280] - - [374, 8416.4] + - [469, 8416.4] - - [448, 1856, 1, 3328] - - [374, 7161.56] + - [469, 7161.56] - - [4, 1024, 1, 256] - - [340, 187.346] + - [435, 187.346] - - [5056, 4288, 1, 256] - - [385, 8947.26] + - [480, 8947.26] - - [1024, 448, 1, 256] - - [380, 5318.96] + - [475, 5318.96] - - [1024, 3584, 1, 256] - - [375, 6152.04] + - [470, 6152.04] - - [2944, 128, 1, 1280] - - [356, 6053.63] + - [451, 6053.63] - - [1856, 5056, 1, 128] - - [366, 5091.42] + - [461, 5091.42] - - [64, 256, 1, 256] - - [329, 771.112] + - [424, 771.112] - - [1408, 4, 1, 128] - - [386, 40.8758] + - [481, 40.8758] - - [128, 2368, 1, 128] - - [314, 1520.37] + - [409, 1520.37] - - [256, 704, 1, 1280] - - [374, 4329.81] + - [469, 4329.81] - - [64, 2368, 1, 128] - - [305, 1212.52] + - [400, 1212.52] - - [6784, 6784, 1, 3328] - - [385, 8310.67] + - [480, 8310.67] - - [448, 5888, 1, 1280] - - [380, 8502.33] + - [475, 8502.33] - - [5056, 448, 1, 128] - - [366, 4161.0] + - [461, 4161.0] - - [3584, 2944, 1, 128] - - [366, 4363.51] + - [461, 4363.51] - - [6784, 256, 1, 1280] - - [380, 8629.67] + - [475, 8629.67] - - [256, 2944, 1, 1280] - - [380, 7277.48] + - [475, 7277.48] - - [64, 4288, 1, 128] - - [305, 1822.06] + - [400, 1822.06] - - [2368, 5888, 1, 3328] - - [374, 9017.52] + - [469, 9017.52] - - [4, 64, 1, 256] - - [340, 16.1627] + - [435, 16.1627] - - [704, 1024, 1, 3328] - - [380, 8059.55] + - [475, 8059.55] - - [2368, 1856, 1, 1280] - - [380, 8813.24] + - [475, 8813.24] - - [128, 448, 1, 128] - - [301, 588.244] + - [396, 588.244] - - [128, 6784, 1, 256] - - [380, 6538.28] + - [475, 6538.28] - - [3584, 4288, 1, 128] - - [366, 5025.46] + - [461, 5025.46] - - [64, 448, 1, 128] - - [318, 231.793] + - [413, 231.793] - - [5888, 4288, 1, 3328] - - [374, 9515.88] + - [469, 9515.88] - - [2368, 704, 1, 256] - - [380, 7642.84] + - [475, 7642.84] - - [256, 1856, 1, 3328] - - [380, 6547.17] + - [475, 6547.17] - - [1856, 128, 1, 256] - - [374, 3782.28] + - [469, 3782.28] - - [6784, 128, 1, 128] - - [308, 2835.54] + - [403, 2835.54] - - [3584, 1408, 1, 128] - - [365, 3049.21] + - [460, 3049.21] - - [1856, 5056, 1, 1280] - - [381, 8863.3] + - [476, 8863.3] - - [2944, 1024, 1, 1280] - - [385, 8873.25] + - [480, 8873.25] - - [5056, 4, 1, 256] - - [321, 494.121] + - [416, 494.121] - - [3584, 5888, 1, 3328] - - [373, 9585.25] + - [468, 9585.25] - - [2368, 4288, 1, 256] - - [385, 6419.05] + - [480, 6419.05] - - [1024, 2368, 1, 3328] - - [380, 8645.36] + - [475, 8645.36] - - [64, 704, 1, 3328] - - [362, 4399.93] + - [457, 4399.93] - - [704, 1408, 1, 256] - - [374, 7428.54] + - [469, 7428.54] - - [6784, 1856, 1, 3328] - - [385, 9163.66] + - [480, 9163.66] - - [1024, 2944, 1, 128] - - [369, 3551.98] + - [464, 3551.98] - - [1024, 3584, 1, 1280] - - [383, 9112.47] + - [478, 9112.47] - - [4288, 5888, 1, 3328] - - [373, 8524.05] + - [468, 8524.05] - - [4288, 4, 1, 3328] - - [340, 620.016] + - [435, 620.016] - - [256, 1408, 1, 256] - - [374, 4505.7] + - [469, 4505.7] - - [448, 2944, 1, 1280] - - [374, 7612.87] + - [469, 7612.87] - - [4, 5888, 1, 128] - - [386, 174.564] + - [481, 174.564] - - [1024, 2944, 1, 3328] - - [379, 9136.74] + - [474, 9136.74] - - [3584, 6784, 1, 256] - - [379, 7253.89] + - [474, 7253.89] - - [256, 6784, 1, 1280] - - [374, 8637.72] + - [469, 8637.72] - - [1856, 3584, 1, 256] - - [380, 8199.67] + - [475, 8199.67] - - [128, 448, 1, 3328] - - [361, 4799.92] + - [456, 4799.92] - - [6784, 1856, 1, 128] - - [366, 5185.62] + - [461, 5185.62] - - [4, 448, 1, 256] - - [340, 86.9848] + - [435, 86.9848] - - [2944, 704, 1, 128] - - [369, 3798.64] + - [464, 3798.64] - - [256, 5888, 1, 1280] - - [374, 8678.47] + - [469, 8678.47] - - [4, 128, 1, 1280] - - [343, 102.5] + - [438, 102.5] - - [4288, 6784, 1, 3328] - - [379, 8209.4] + - [474, 8209.4] - - [6784, 128, 1, 1280] - - [356, 6562.99] + - [451, 6562.99] - - [64, 1408, 1, 256] - - [346, 2059.8] + - [441, 2059.8] - - [7680, 5481, 1, 2560] - - [385, 9426.79] + - [480, 9426.79] - - [2368, 1408, 1, 128] - - [366, 4532.5] + - [461, 4532.5] - - [1856, 448, 1, 256] - - [374, 6275.48] + - [469, 6275.48] - - [1408, 1024, 1, 128] - - [366, 3604.58] + - [461, 3604.58] - - [128, 64, 1, 128] - - [301, 87.4813] + - [396, 87.4813] - - [6784, 3584, 1, 3328] - - [381, 8991.92] + - [476, 8991.92] - - [2944, 64, 1, 3328] - - [350, 6043.36] + - [445, 6043.36] - - [64, 64, 1, 128] - - [306, 36.309] + - [401, 36.309] - - [2368, 5056, 1, 1280] - - [380, 9438.48] + - [475, 9438.48] - - [64, 4, 1, 1280] - - [343, 40.2569] + - [438, 40.2569] - - [1408, 2368, 1, 1280] - - [376, 7738.16] + - [471, 7738.16] - - [128, 1408, 1, 1280] - - [348, 4937.74] + - [443, 4937.74] - - [256, 64, 1, 3328] - - [358, 2683.46] + - [453, 2683.46] - - [2944, 4288, 1, 128] - - [366, 5173.81] + - [461, 5173.81] - - [2944, 2944, 1, 256] - - [374, 8943.92] + - [469, 8943.92] - - [2944, 4, 1, 1280] - - [323, 617.857] + - [418, 617.857] - - [5888, 4, 1, 256] - - [389, 483.218] + - [484, 483.218] - - [6784, 256, 1, 256] - - [380, 7916.7] + - [475, 7916.7] - - [256, 5056, 1, 3328] - - [374, 8953.25] + - [469, 8953.25] - - [128, 4288, 1, 1280] - - [327, 6015.05] + - [422, 6015.05] - - [5056, 1856, 1, 128] - - [368, 4221.15] + - [463, 4221.15] - - [5888, 1408, 1, 256] - - [379, 9144.85] + - [474, 9144.85] - - [128, 128, 1, 256] - - [329, 759.938] + - [424, 759.938] - - [5056, 4, 1, 3328] - - [389, 642.818] + - [484, 642.818] - - [4288, 3584, 1, 3328] - - [375, 9300.05] + - [470, 9300.05] - - [448, 704, 1, 3328] - - [381, 4481.08] + - [476, 4481.08] - - [448, 448, 1, 128] - - [305, 1360.81] + - [400, 1360.81] - - [1024, 2368, 1, 1280] - - [374, 8570.29] + - [469, 8570.29] - - [1856, 704, 1, 3328] - - [374, 8448.26] + - [469, 8448.26] - - [4, 2368, 1, 128] - - [386, 64.5902] + - [481, 64.5902] - - [5888, 6784, 1, 3328] - - [381, 9447.12] + - [476, 9447.12] - - [704, 4288, 1, 1280] - - [383, 7476.87] + - [478, 7476.87] - - [704, 256, 1, 256] - - [374, 2957.62] + - [469, 2957.62] - - [6784, 448, 1, 3328] - - [377, 8886.22] + - [472, 8886.22] - - [4288, 1024, 1, 128] - - [365, 3864.49] + - [460, 3864.49] - - [49, 512, 128, 2048] - - [408, 7112.78] + - [503, 7112.78] - - [196, 256, 256, 1024] - - [402, 8302.7] + - [497, 8302.7] - - [784, 512, 256, 128] - - [400, 9061.36] + - [495, 9061.36] - - [49, 2048, 128, 512] - - [398, 6963.36] - - - [784, 512, 64, 128] - - [400, 8822.62] + - [493, 6963.36] - - [784, 128, 128, 512] - - [407, 8983.63] + - [502, 8983.63] - - [196, 256, 64, 1024] - - [406, 7823.5] + - [501, 7823.5] - - [3136, 256, 256, 64] - - [403, 9051.38] + - [498, 9051.38] - - [3136, 64, 128, 64] - - [399, 8581.35] + - [494, 8581.35] - - [49, 2048, 256, 512] - - [398, 7049.64] - - - [196, 1024, 64, 256] - - [401, 7953.69] + - [493, 7049.64] - - [784, 128, 256, 512] - - [409, 9102.99] + - [504, 9102.99] - - [196, 256, 128, 1024] - - [401, 8085.89] - - - [3136, 64, 64, 256] - - [405, 9266.13] - - - [784, 128, 64, 512] - - [406, 8809.39] - - - [49, 2048, 64, 512] - - [398, 6843.95] + - [496, 8085.89] - - [3136, 64, 128, 256] - - [405, 9381.39] + - [500, 9381.39] - - [3136, 256, 128, 64] - - [403, 8982.64] + - [498, 8982.64] - - [784, 512, 128, 128] - - [400, 8965.99] - - - [3136, 256, 64, 64] - - [403, 8879.8] + - [495, 8965.99] - - [3136, 64, 256, 256] - - [405, 9566.43] - - - [3136, 64, 64, 64] - - [404, 8314.05] + - [500, 9566.43] - - [3136, 64, 256, 64] - - [399, 8743.8] + - [494, 8743.8] - - [196, 1024, 128, 256] - - [402, 8119.43] - - - [49, 512, 64, 2048] - - [410, 7055.41] + - [497, 8119.43] - - [49, 512, 256, 2048] - - [411, 7166.41] + - [506, 7166.41] - - [196, 1024, 256, 256] - - [402, 8210.66] + - [497, 8210.66] - - [5329, 160, 64, 64] - - [418, 8156.89] + - [513, 8156.89] - - [1225, 288, 64, 48] - - [422, 6926.23] + - [517, 6926.23] - - [1225, 192, 64, 64] - - [424, 7840.1] + - [519, 7840.1] - - [64, 1280, 64, 384] - - [425, 9276.11] + - [520, 9276.11] - - [1225, 384, 64, 192] - - [415, 9162.35] + - [510, 9162.35] - - [1225, 288, 64, 64] - - [416, 7495.27] + - [511, 7495.27] - - [5329, 64, 64, 80] - - [417, 8480.13] + - [512, 8480.13] - - [289, 1024, 64, 256] - - [415, 8483.83] + - [510, 8483.83] - - [289, 768, 64, 192] - - [421, 8234.84] + - [516, 8234.84] - - [289, 768, 64, 128] - - [421, 7988.81] + - [516, 7988.81] - - [64, 1536, 64, 384] - - [425, 9323.65] + - [520, 9323.65] - - [1225, 384, 64, 64] - - [424, 8158.8] + - [519, 8158.8] - - [64, 2048, 64, 192] - - [421, 8818.61] + - [516, 8818.61] - - [64, 1280, 64, 320] - - [417, 9202.17] + - [512, 9202.17] - - [1225, 384, 64, 96] - - [415, 8540.7] + - [510, 8540.7] - - [64, 1280, 64, 448] - - [421, 9317.82] + - [516, 9317.82] - - [289, 768, 64, 160] - - [425, 8128.81] + - [520, 8128.81] - - [1225, 192, 64, 32] - - [424, 6495.37] + - [519, 6495.37] - - [64, 1536, 64, 256] - - [421, 9143.0] + - [516, 9143.0] - - [1225, 256, 64, 48] - - [419, 7545.36] + - [514, 7545.36] - - [1225, 256, 64, 64] - - [420, 7972.45] + - [515, 7972.45] - - [1225, 192, 64, 48] - - [423, 7348.9] + - [518, 7348.9] - - [289, 1024, 64, 384] - - [413, 8725.66] + - [508, 8725.66] - - [289, 1024, 64, 192] - - [415, 8313.16] + - [510, 8313.16] - - [64, 1280, 64, 192] - - [417, 8768.68] + - [512, 8768.68] - - [64, 2048, 64, 320] - - [414, 9147.98] + - [509, 9147.98] - - [64, 2048, 64, 448] - - [412, 9304.16] + - [507, 9304.16] - - [64, 2048, 64, 384] - - [414, 9235.28] + - [509, 9235.28] - - [289, 1024, 64, 128] - - [421, 7989.51] + - [516, 7989.51] - - [4096, 1024, 1, 2984] - - [460, 9846.39] + - [555, 9846.39] - - [1024, 4096, 1, 3437] - - [461, 9915.8] + - [556, 9915.8] - - [1024, 4096, 1, 3235] - - [454, 9914.02] + - [549, 9914.02] - - [4096, 1024, 1, 4032] - - [460, 9926.06] + - [555, 9926.06] - - [1024, 4096, 1, 3334] - - [461, 9918.27] + - [556, 9918.27] - - [4096, 1024, 1, 3288] - - [461, 9854.67] + - [556, 9854.67] - - [1024, 4096, 1, 3515] - - [461, 9924.03] + - [556, 9924.03] - - [4096, 1024, 1, 3437] - - [461, 9869.63] + - [556, 9869.63] - - [1024, 4096, 1, 3259] - - [461, 9907.65] + - [556, 9907.65] - - [1024, 4096, 1, 3384] - - [453, 9921.21] + - [548, 9921.21] - - [64, 92, 688, 92] - - [431, 6137.89] + - [526, 6137.89] - - [4096, 1024, 1, 3458] - - [460, 9887.69] + - [555, 9887.69] - - [1024, 4096, 1, 3412] - - [460, 9930.56] + - [555, 9930.56] - - [1024, 4096, 1, 3529] - - [454, 9924.54] + - [549, 9924.54] - - [1024, 4096, 1, 4032] - - [461, 9963.48] + - [556, 9963.48] - - [4096, 1024, 1, 3999] - - [461, 9895.0] + - [556, 9895.0] - - [1024, 4096, 1, 3079] - - [454, 9894.58] + - [549, 9894.58] - - [1024, 4096, 1, 3876] - - [453, 9949.39] + - [548, 9949.39] - - [1024, 4096, 1, 3450] - - [461, 9915.65] + - [556, 9915.65] - - [1024, 4096, 1, 3256] - - [461, 9911.18] + - [556, 9911.18] - - [4096, 1024, 1, 3403] - - [460, 9858.93] + - [555, 9858.93] - - [1024, 1024, 1, 3975] - - [451, 8990.81] + - [546, 8990.81] - - [1024, 4096, 1, 3359] - - [461, 9915.0] + - [556, 9915.0] - - [4096, 1024, 1, 3549] - - [460, 9870.66] + - [555, 9870.66] - - [4096, 1024, 1, 3176] - - [461, 9855.92] + - [556, 9855.92] - - [1024, 4096, 1, 3504] - - [453, 9934.17] + - [548, 9934.17] - - [4096, 1024, 1, 3314] - - [460, 9873.9] + - [555, 9873.9] - - [4096, 1024, 1, 3183] - - [460, 9843.84] + - [555, 9843.84] - - [1024, 4096, 1, 3209] - - [454, 9904.97] + - [549, 9904.97] - - [1024, 4096, 1, 3720] - - [453, 9934.16] + - [548, 9934.16] - - [1024, 4096, 1, 3859] - - [453, 9952.53] + - [548, 9952.53] - - [1024, 33708, 1, 4059] - - [453, 10321.5] + - [548, 10321.5] - - [1024, 4096, 1, 3968] - - [453, 9955.96] + - [548, 9955.96] - - [64, 123, 528, 123] - - [426, 6916.21] + - [521, 6916.21] - - [4096, 1024, 1, 3477] - - [461, 9872.03] + - [556, 9872.03] - - [4096, 1024, 1, 3233] - - [461, 9862.35] + - [556, 9862.35] - - [4096, 1024, 1, 3409] - - [461, 9876.86] + - [556, 9876.86] - - [4096, 1024, 1, 3564] - - [461, 9870.49] + - [556, 9870.49] - - [64, 102, 624, 100] - - [426, 5773.16] + - [521, 5773.16] - - [4096, 1024, 1, 3190] - - [460, 9850.97] + - [555, 9850.97] - - [64, 112, 576, 111] - - [426, 6517.35] + - [521, 6517.35] - - [1024, 4096, 1, 3288] - - [460, 9911.9] + - [555, 9911.9] - - [4096, 1024, 1, 3451] - - [460, 9859.61] + - [555, 9859.61] - - [1024, 4096, 1, 3348] - - [453, 9915.47] + - [548, 9915.47] - - [64, 102, 624, 102] - - [426, 5783.7] + - [521, 5783.7] - - [1024, 4096, 1, 3465] - - [454, 9913.12] + - [549, 9913.12] - - [1024, 33708, 1, 4032] - - [453, 10340.4] + - [548, 10340.4] - - [1024, 33708, 1, 3840] - - [453, 10341.8] + - [548, 10341.8] - - [4096, 1024, 1, 3391] - - [461, 9861.77] + - [556, 9861.77] - - [1024, 4096, 1, 3530] - - [453, 9920.44] + - [548, 9920.44] - - [4096, 1024, 1, 3209] - - [460, 9847.0] + - [555, 9847.0] - - [1024, 4096, 1, 3457] - - [454, 9917.29] + - [549, 9917.29] - - [1024, 4096, 1, 3386] - - [453, 9917.65] + - [548, 9917.65] - - [4096, 1024, 1, 3350] - - [460, 9884.54] + - [555, 9884.54] - - [1024, 4096, 1, 3184] - - [461, 9925.98] + - [556, 9925.98] - - [1024, 4096, 1, 3093] - - [460, 9902.55] + - [555, 9902.55] - - [64, 133, 480, 135] - - [443, 6205.97] + - [538, 6205.97] - - [1024, 4096, 1, 3400] - - [453, 9917.1] + - [548, 9917.1] - - [1024, 1024, 1, 4026] - - [459, 9014.39] + - [554, 9014.39] - - [1024, 4096, 1, 3214] - - [453, 9895.94] + - [548, 9895.94] - - [4096, 1024, 1, 3406] - - [461, 9857.82] + - [556, 9857.82] - - [1024, 4096, 1, 3565] - - [460, 9919.37] + - [555, 9919.37] - - [4096, 1024, 1, 3536] - - [461, 9889.06] + - [556, 9889.06] - - [1024, 4096, 1, 3183] - - [460, 9907.55] + - [555, 9907.55] - - [1024, 4096, 1, 3462] - - [461, 9922.4] + - [556, 9922.4] - - [4096, 1024, 1, 3130] - - [454, 9846.04] + - [549, 9846.04] - - [4096, 1024, 1, 3381] - - [461, 9868.27] + - [556, 9868.27] - - [4096, 1024, 1, 3298] - - [460, 9870.54] + - [555, 9870.54] - - [1024, 4096, 1, 3292] - - [453, 9906.3] + - [548, 9906.3] - - [4096, 1024, 1, 3289] - - [460, 9856.55] + - [555, 9856.55] - - [64, 160, 400, 159] - - [446, 7427.84] + - [541, 7427.84] - - [1024, 4096, 1, 3379] - - [453, 9917.09] + - [548, 9917.09] - - [1024, 4096, 1, 3990] - - [454, 9947.37] + - [549, 9947.37] - - [1024, 4096, 1, 3540] - - [461, 9935.76] + - [556, 9935.76] - - [4096, 1024, 1, 3412] - - [461, 9867.56] + - [556, 9867.56] - - [1024, 1024, 1, 3780] - - [456, 9036.26] + - [551, 9036.26] - - [1024, 4096, 1, 3555] - - [460, 9927.37] + - [555, 9927.37] - - [1024, 4096, 1, 3518] - - [454, 9925.55] + - [549, 9925.55] - - [4096, 1024, 1, 3189] - - [460, 9861.24] + - [555, 9861.24] - - [1024, 4096, 1, 3298] - - [454, 9923.22] + - [549, 9923.22] - - [4096, 1024, 1, 3072] - - [460, 9872.08] + - [555, 9872.08] - - [1024, 4096, 1, 3393] - - [461, 9929.28] + - [556, 9929.28] - - [1024, 4096, 1, 3207] - - [453, 9912.81] + - [548, 9912.81] - - [64, 228, 272, 232] - - [449, 7350.14] + - [544, 7350.14] - - [64, 23, 2720, 23] - - [430, 2640.25] + - [525, 2640.25] - - [4096, 1024, 1, 3487] - - [461, 9860.91] + - [556, 9860.91] - - [1024, 1024, 1, 3822] - - [459, 8993.96] + - [554, 8993.96] - - [64, 77, 816, 77] - - [431, 5273.19] + - [526, 5273.19] - - [4096, 1024, 1, 3431] - - [461, 9867.53] + - [556, 9867.53] - - [4096, 1024, 1, 3378] - - [460, 9888.14] + - [555, 9888.14] - - [4096, 1024, 1, 3529] - - [454, 9879.5] + - [549, 9879.5] - - [4096, 1024, 1, 3460] - - [461, 9877.25] + - [556, 9877.25] - - [1024, 4096, 1, 3336] - - [453, 9912.41] + - [548, 9912.41] - - [1024, 4096, 1, 3501] - - [454, 9914.4] + - [549, 9914.4] - - [64, 159, 400, 159] - - [444, 7016.51] + - [539, 7016.51] - - [1024, 4096, 1, 3584] - - [461, 9940.59] + - [556, 9940.59] - - [64, 135, 480, 134] - - [444, 6241.39] + - [539, 6241.39] - - [64, 99, 624, 99] - - [435, 5617.39] + - [530, 5617.39] - - [4096, 1024, 1, 2499] - - [460, 9813.57] + - [555, 9813.57] - - [1024, 1024, 1, 3942] - - [456, 9060.01] + - [551, 9060.01] - - [4096, 1024, 1, 3352] - - [460, 9867.12] + - [555, 9867.12] - - [1024, 4096, 1, 3543] - - [461, 9928.77] + - [556, 9928.77] - - [1024, 4096, 1, 3476] - - [460, 9931.58] + - [555, 9931.58] - - [1024, 33708, 1, 3822] - - [453, 10324.7] + - [548, 10324.7] - - [1024, 4096, 1, 3436] - - [453, 9917.28] + - [548, 9917.28] - - [1024, 1024, 1, 3861] - - [452, 8998.49] + - [547, 8998.49] - - [1024, 1024, 1, 4000] - - [457, 9058.3] + - [552, 9058.3] - - [1024, 4096, 1, 3594] - - [453, 9927.88] + - [548, 9927.88] - - [4096, 1024, 1, 3514] - - [461, 9872.3] + - [556, 9872.3] - - [1024, 4096, 1, 3064] - - [460, 9907.1] + - [555, 9907.1] - - [4096, 1024, 1, 3371] - - [453, 9857.74] + - [548, 9857.74] - - [4096, 1024, 1, 3558] - - [461, 9876.31] + - [556, 9876.31] - - [4096, 1024, 1, 3517] - - [460, 9866.45] + - [555, 9866.45] - - [4096, 1024, 1, 3144] - - [460, 9846.36] + - [555, 9846.36] - - [1024, 4096, 1, 3312] - - [453, 9932.85] + - [548, 9932.85] - - [4096, 1024, 1, 3079] - - [460, 9851.1] + - [555, 9851.1] - - [1024, 4096, 1, 3415] - - [453, 9919.47] + - [548, 9919.47] - - [1024, 4096, 1, 3221] - - [460, 9908.18] + - [555, 9908.18] - - [1024, 4096, 1, 3978] - - [454, 9944.41] + - [549, 9944.41] - - [4096, 1024, 1, 3876] - - [460, 9898.99] + - [555, 9898.99] - - [1024, 4096, 1, 3528] - - [453, 9919.6] + - [548, 9919.6] - - [1024, 4096, 1, 3181] - - [461, 9894.86] + - [556, 9894.86] - - [4096, 1024, 1, 3445] - - [460, 9878.54] + - [555, 9878.54] - - [4096, 1024, 1, 3450] - - [453, 9864.82] + - [548, 9864.82] - - [4096, 1024, 1, 3377] - - [460, 9879.69] + - [555, 9879.69] - - [1024, 4096, 1, 3532] - - [454, 9928.19] + - [549, 9928.19] - - [1024, 33708, 1, 3944] - - [453, 10329.7] + - [548, 10329.7] - - [4096, 1024, 1, 3483] - - [460, 9861.83] + - [555, 9861.83] - - [1024, 4096, 1, 3358] - - [453, 9903.69] + - [548, 9903.69] - - [4096, 1024, 1, 3464] - - [460, 9876.84] + - [555, 9876.84] - - [4096, 1024, 1, 3282] - - [453, 9859.23] + - [548, 9859.23] - - [4096, 1024, 1, 3256] - - [461, 9855.1] + - [556, 9855.1] - - [1024, 4096, 1, 3057] - - [460, 9910.75] + - [555, 9910.75] - - [4096, 1024, 1, 3481] - - [460, 9866.29] + - [555, 9866.29] - - [4096, 1024, 1, 3340] - - [460, 9862.25] + - [555, 9862.25] - - [1024, 1024, 1, 3870] - - [459, 9082.45] + - [554, 9082.45] - - [1024, 4096, 1, 3273] - - [453, 9916.29] + - [548, 9916.29] - - [64, 65, 992, 65] - - [444, 4683.01] + - [539, 4683.01] - - [4096, 1024, 1, 3392] - - [454, 9881.12] + - [549, 9881.12] - - [4096, 1024, 1, 3337] - - [460, 9864.5] + - [555, 9864.5] - - [4096, 1024, 1, 3359] - - [460, 9874.42] + - [555, 9874.42] - - [4096, 1024, 1, 3498] - - [461, 9864.35] + - [556, 9864.35] - - [4096, 1024, 1, 3169] - - [460, 9851.1] + - [555, 9851.1] - - [1024, 33708, 1, 3859] - - [454, 10332.6] + - [549, 10332.6] - - [64, 19, 3264, 19] - - [430, 2182.14] + - [525, 2182.14] - - [1024, 4096, 1, 3103] - - [453, 9898.9] + - [548, 9898.9] - - [4096, 1024, 1, 3900] - - [460, 9897.12] + - [555, 9897.12] - - [1024, 4096, 1, 3442] - - [460, 9938.97] + - [555, 9938.97] - - [1024, 4096, 1, 3248] - - [460, 9939.92] + - [555, 9939.92] - - [1024, 4096, 1, 3351] - - [461, 9923.23] + - [556, 9923.23] - - [4096, 1024, 1, 3593] - - [460, 9894.36] + - [555, 9894.36] - - [1024, 4096, 1, 3780] - - [460, 9941.96] + - [555, 9941.96] - - [64, 133, 480, 133] - - [444, 6180.79] + - [539, 6180.79] - - [1024, 33708, 1, 3681] - - [453, 10332.3] + - [548, 10332.3] - - [4096, 1024, 1, 3374] - - [454, 9859.36] + - [549, 9859.36] - - [1024, 4096, 1, 3557] - - [453, 9928.2] + - [548, 9928.2] - - [4096, 1024, 1, 3906] - - [460, 9907.07] + - [555, 9907.07] - - [4096, 1024, 1, 3504] - - [460, 9886.05] + - [555, 9886.05] - - [1024, 4096, 1, 3270] - - [460, 9916.37] + - [555, 9916.37] - - [4096, 1024, 1, 3098] - - [453, 9854.76] + - [548, 9854.76] - - [64, 232, 272, 232] - - [449, 7394.1] + - [544, 7394.1] - - [4096, 1024, 1, 3216] - - [461, 9876.57] + - [556, 9876.57] - - [64, 148, 432, 148] - - [446, 6663.85] + - [541, 6663.85] - - [1024, 4096, 1, 3550] - - [460, 9920.28] + - [555, 9920.28] - - [4096, 1024, 1, 3449] - - [454, 9870.57] + - [549, 9870.57] - - [1024, 4096, 1, 3403] - - [461, 9908.21] + - [556, 9908.21] - - [1024, 4096, 1, 3523] - - [460, 9932.71] + - [555, 9932.71] - - [1024, 4096, 1, 3486] - - [460, 9917.46] + - [555, 9917.46] - - [1024, 4096, 1, 3564] - - [460, 9923.44] + - [555, 9923.44] - - [1024, 33708, 1, 4005] - - [453, 10339.5] + - [548, 10339.5] - - [4096, 1024, 1, 3296] - - [460, 9879.78] + - [555, 9879.78] - - [1024, 4096, 1, 3263] - - [453, 9907.17] + - [548, 9907.17] - - [64, 25, 2512, 25] - - [430, 2848.17] + - [525, 2848.17] - - [1024, 4096, 1, 3130] - - [461, 9900.1] + - [556, 9900.1] - - [1024, 4096, 1, 3295] - - [461, 9895.45] + - [556, 9895.45] - - [1024, 33708, 1, 3925] - - [454, 10342.3] + - [549, 10342.3] - - [1024, 4096, 1, 3378] - - [453, 9921.37] + - [548, 9921.37] - - [4096, 1024, 1, 3720] - - [461, 9885.82] + - [556, 9885.82] - - [4096, 1024, 1, 3399] - - [460, 9880.65] + - [555, 9880.65] - - [4096, 1024, 1, 3543] - - [461, 9870.73] + - [556, 9870.73] - - [64, 9, 6544, 9] - - [433, 955.17] + - [528, 955.17] - - [4096, 1024, 1, 3497] - - [460, 9868.43] + - [555, 9868.43] - - [4096, 1024, 1, 3594] - - [461, 9876.88] + - [556, 9876.88] - - [1024, 4096, 1, 3144] - - [461, 9901.96] + - [556, 9901.96] - - [1024, 4096, 1, 3975] - - [454, 9950.19] + - [549, 9950.19] - - [4096, 1024, 1, 3205] - - [461, 9856.07] + - [556, 9856.07] - - [1024, 33708, 1, 3995] - - [453, 10331.1] + - [548, 10331.1] - - [1024, 4096, 1, 3392] - - [453, 9935.78] + - [548, 9935.78] - - [1024, 4096, 1, 3055] - - [461, 9893.25] + - [556, 9893.25] - - [1024, 4096, 1, 4026] - - [461, 9940.22] + - [556, 9940.22] - - [4096, 1024, 1, 3557] - - [460, 9884.0] + - [555, 9884.0] - - [4096, 1024, 1, 3515] - - [460, 9871.94] + - [555, 9871.94] - - [4096, 1024, 1, 3486] - - [461, 9860.74] + - [556, 9860.74] - - [4096, 1024, 1, 3457] - - [461, 9885.37] + - [556, 9885.37] - - [1024, 4096, 1, 3511] - - [453, 9928.24] + - [548, 9928.24] - - [4096, 1024, 1, 3138] - - [460, 9854.06] + - [555, 9854.06] - - [1024, 4096, 1, 3339] - - [454, 9912.89] + - [549, 9912.89] - - [1024, 4096, 1, 3939] - - [454, 9952.26] + - [549, 9952.26] - - [4096, 1024, 1, 3500] - - [454, 9863.62] + - [549, 9863.62] - - [4096, 1024, 1, 3395] - - [461, 9883.82] + - [556, 9883.82] - - [4096, 1024, 1, 3968] - - [461, 9920.36] + - [556, 9920.36] - - [4096, 1024, 1, 4020] - - [461, 9912.81] + - [556, 9912.81] - - [4096, 1024, 1, 3942] - - [460, 9910.17] + - [555, 9910.17] - - [1024, 1024, 1, 4032] - - [450, 9024.74] + - [545, 9024.74] - - [4096, 1024, 1, 3349] - - [461, 9866.04] + - [556, 9866.04] - - [1024, 4096, 1, 3322] - - [454, 9908.43] + - [549, 9908.43] - - [4096, 1024, 1, 3452] - - [460, 9872.69] + - [555, 9872.69] - - [1024, 4096, 1, 3417] - - [460, 9912.64] + - [555, 9912.64] - - [1024, 1024, 1, 4012] - - [458, 9085.47] + - [553, 9085.47] - - [1024, 4096, 1, 3526] - - [454, 9920.36] + - [549, 9920.36] - - [4096, 1024, 1, 3485] - - [454, 9861.64] + - [549, 9861.64] - - [1024, 1024, 1, 3681] - - [458, 8991.46] + - [553, 8991.46] - - [4096, 1024, 1, 3303] - - [461, 9861.3] + - [556, 9861.3] - - [4096, 1024, 1, 3344] - - [461, 9892.44] + - [556, 9892.44] - - [1024, 4096, 1, 3479] - - [461, 9921.77] + - [556, 9921.77] - - [4096, 1024, 1, 3300] - - [460, 9868.64] + - [555, 9868.64] - - [1024, 4096, 1, 3439] - - [454, 9918.29] + - [549, 9918.29] - - [4096, 1024, 1, 3280] - - [461, 9875.29] + - [556, 9875.29] - - [1024, 4096, 1, 3245] - - [453, 9910.49] + - [548, 9910.49] - - [1024, 4096, 1, 3328] - - [453, 9941.6] + - [548, 9941.6] - - [4096, 1024, 1, 3418] - - [453, 9870.76] + - [548, 9870.76] - - [1024, 4096, 1, 3493] - - [461, 9938.45] + - [556, 9938.45] - - [1024, 4096, 1, 3500] - - [453, 9916.93] + - [548, 9916.93] - - [1024, 4096, 1, 3166] - - [453, 9898.12] + - [548, 9898.12] - - [4096, 1024, 1, 3126] - - [454, 9847.04] + - [549, 9847.04] - - [1024, 4096, 1, 3277] - - [461, 9898.66] + - [556, 9898.66] - - [1024, 4096, 1, 3315] - - [460, 9923.11] + - [555, 9923.11] - - [1024, 1024, 1, 3927] - - [451, 8987.71] + - [546, 8987.71] - - [1024, 4096, 1, 3414] - - [453, 9916.01] + - [548, 9916.01] - - [4096, 1024, 1, 3531] - - [460, 9871.92] + - [555, 9871.92] - - [4096, 1024, 1, 3484] - - [453, 9867.86] + - [548, 9867.86] - - [1024, 4096, 1, 3180] - - [460, 9904.09] + - [555, 9904.09] - - [4096, 1024, 1, 3360] - - [460, 9879.57] + - [555, 9879.57] - - [1024, 33708, 1, 3990] - - [453, 10335.0] + - [548, 10335.0] - - [4096, 1024, 1, 3466] - - [460, 9875.02] + - [555, 9875.02] - - [1024, 4096, 1, 3428] - - [453, 9916.02] + - [548, 9916.02] - - [1024, 4096, 1, 3137] - - [460, 9913.27] + - [555, 9913.27] - - [4096, 1024, 1, 4059] - - [460, 9901.86] + - [555, 9901.86] - - [1024, 4096, 1, 3353] - - [460, 9914.6] + - [555, 9914.6] - - [1024, 4096, 1, 3942] - - [460, 9944.5] + - [555, 9944.5] - - [4096, 1024, 1, 3506] - - [453, 9875.75] + - [548, 9875.75] - - [1024, 1024, 1, 3894] - - [451, 8946.55] + - [546, 8946.55] - - [4096, 1024, 1, 3508] - - [461, 9877.67] + - [556, 9877.67] - - [64, 132, 480, 135] - - [444, 6164.86] + - [539, 6164.86] - - [4096, 1024, 1, 3956] - - [453, 9907.83] + - [548, 9907.83] - - [64, 7, 8192, 7] - - [432, 813.078] + - [527, 813.078] - - [1024, 4096, 1, 3272] - - [454, 9909.82] + - [549, 9909.82] - - [1024, 4096, 1, 3443] - - [461, 9929.83] + - [556, 9929.83] - - [1024, 4096, 1, 3375] - - [461, 9909.23] + - [556, 9909.23] - - [1024, 4096, 1, 3525] - - [461, 9929.27] + - [556, 9929.27] - - [4096, 1024, 1, 3472] - - [460, 9889.97] + - [555, 9889.97] - - [1024, 4096, 1, 3520] - - [453, 9947.79] + - [548, 9947.79] - - [4096, 1024, 1, 3322] - - [460, 9862.98] + - [555, 9862.98] - - [4096, 1024, 1, 3387] - - [460, 9861.62] + - [555, 9861.62] - - [64, 8, 7280, 8] - - [438, 1024.1] + - [533, 1024.1] - - [1024, 33708, 1, 3939] - - [453, 10339.9] + - [548, 10339.9] - - [4096, 1024, 1, 3345] - - [461, 9873.68] + - [556, 9873.68] - - [4096, 1024, 1, 2967] - - [460, 9839.21] + - [555, 9839.21] - - [1024, 4096, 1, 3453] - - [453, 9905.81] + - [548, 9905.81] - - [1024, 4096, 1, 3640] - - [460, 9934.05] + - [555, 9934.05] - - [4096, 1024, 1, 3291] - - [454, 9860.84] + - [549, 9860.84] - - [1024, 4096, 1, 3350] - - [461, 9918.03] + - [556, 9918.03] - - [4096, 1024, 1, 3417] - - [460, 9864.61] + - [555, 9864.61] - - [64, 135, 480, 135] - - [444, 6265.45] + - [539, 6265.45] - - [1024, 4096, 1, 3467] - - [454, 9906.95] + - [549, 9906.95] - - [1024, 4096, 1, 3491] - - [460, 9933.3] + - [555, 9933.3] - - [1024, 4096, 1, 3822] - - [460, 9938.75] + - [555, 9938.75] - - [4096, 1024, 1, 3292] - - [460, 9849.21] + - [555, 9849.21] - - [1024, 4096, 1, 3231] - - [453, 9905.82] + - [548, 9905.82] - - [1024, 4096, 1, 3364] - - [454, 9930.32] + - [549, 9930.32] - - [1024, 4096, 1, 3995] - - [454, 9943.76] + - [549, 9943.76] - - [1024, 4096, 1, 3545] - - [453, 9928.53] + - [548, 9928.53] - - [1024, 1024, 1, 3876] - - [451, 9003.04] + - [546, 9003.04] - - [1024, 4096, 1, 3186] - - [453, 9921.01] + - [548, 9921.01] - - [4096, 1024, 1, 3432] - - [460, 9875.29] + - [555, 9875.29] - - [64, 84, 752, 85] - - [431, 5704.51] + - [526, 5704.51] - - [4096, 1024, 1, 3367] - - [454, 9868.06] + - [549, 9868.06] - - [4096, 1024, 1, 3503] - - [461, 9871.01] + - [556, 9871.01] - - [1024, 4096, 1, 3095] - - [454, 9902.9] + - [549, 9902.9] - - [4096, 1024, 1, 3465] - - [461, 9872.17] + - [556, 9872.17] - - [1024, 4096, 1, 3402] - - [460, 9914.66] + - [555, 9914.66] - - [4096, 1024, 1, 3140] - - [460, 9847.95] + - [555, 9847.95] - - [1024, 1024, 1, 4050] - - [457, 9055.75] + - [552, 9055.75] - - [4096, 1024, 1, 3424] - - [454, 9894.62] + - [549, 9894.62] - - [4096, 1024, 1, 3257] - - [453, 9860.97] + - [548, 9860.97] - - [4096, 1024, 1, 2917] - - [460, 9845.91] + - [555, 9845.91] - - [1024, 33708, 1, 3640] - - [453, 10321.7] + - [548, 10321.7] - - [1024, 4096, 1, 3456] - - [453, 9950.35] + - [548, 9950.35] - - [1024, 4096, 1, 3014] - - [453, 9907.97] + - [548, 9907.97] - - [4096, 1024, 1, 3372] - - [461, 9868.37] + - [556, 9868.37] - - [64, 132, 480, 132] - - [444, 6121.62] + - [539, 6121.62] - - [1024, 4096, 1, 3294] - - [461, 9903.23] + - [556, 9903.23] - - [4096, 1024, 1, 3446] - - [461, 9871.69] + - [556, 9871.69] - - [1024, 4096, 1, 3389] - - [454, 9909.27] + - [549, 9909.27] - - [4096, 1024, 1, 3259] - - [460, 9860.76] + - [555, 9860.76] - - [4096, 1024, 1, 3544] - - [460, 9878.76] + - [555, 9878.76] - - [4096, 1024, 1, 3479] - - [461, 9873.97] + - [556, 9873.97] - - [4096, 1024, 1, 3542] - - [460, 9878.97] + - [555, 9878.97] - - [4096, 1024, 1, 3321] - - [453, 9861.13] + - [548, 9861.13] - - [1024, 4096, 1, 3147] - - [453, 9894.77] + - [548, 9894.77] - - [1024, 4096, 1, 3944] - - [453, 9950.51] + - [548, 9950.51] - - [4096, 1024, 1, 3870] - - [461, 9881.74] + - [556, 9881.74] - - [1024, 4096, 1, 3308] - - [453, 9907.26] + - [548, 9907.26] - - [4096, 1024, 1, 3401] - - [460, 9864.59] + - [555, 9864.59] - - [1024, 4096, 1, 3395] - - [453, 9929.03] + - [548, 9929.03] - - [64, 99, 624, 102] - - [429, 5651.36] + - [524, 5651.36] - - [1024, 4096, 1, 3563] - - [460, 9922.76] + - [555, 9922.76] - - [1024, 33708, 1, 3870] - - [453, 10325.4] + - [548, 10325.4] - - [4096, 1024, 1, 3494] - - [460, 9875.37] + - [555, 9875.37] - - [1024, 4096, 1, 3271] - - [453, 9913.09] + - [548, 9913.09] - - [1024, 33708, 1, 3910] - - [453, 10341.5] + - [548, 10341.5] - - [1024, 4096, 1, 3287] - - [461, 9924.87] + - [556, 9924.87] - - [1024, 33708, 1, 3860] - - [453, 10330.7] + - [548, 10330.7] - - [64, 143, 432, 148] - - [446, 6571.78] + - [541, 6571.78] - - [1024, 1024, 1, 3584] - - [458, 8975.31] + - [553, 8975.31] - - [64, 162, 400, 162] - - [448, 6822.26] + - [543, 6822.26] - - [4096, 1024, 1, 3341] - - [460, 9854.66] + - [555, 9854.66] - - [1024, 4096, 1, 3136] - - [453, 9926.86] + - [548, 9926.86] - - [4096, 1024, 1, 3439] - - [460, 9854.33] + - [555, 9854.33] - - [64, 148, 432, 147] - - [444, 6677.61] + - [539, 6677.61] - - [1024, 4096, 1, 3751] - - [460, 9938.48] + - [555, 9938.48] - - [1024, 4096, 1, 3301] - - [460, 9919.15] + - [555, 9919.15] - - [4096, 1024, 1, 3468] - - [461, 9859.83] + - [556, 9859.83] - - [1024, 4096, 1, 3416] - - [461, 9918.52] + - [556, 9918.52] - - [4096, 1024, 1, 3163] - - [460, 9854.65] + - [555, 9854.65] - - [1024, 4096, 1, 3230] - - [454, 9897.54] + - [549, 9897.54] - - [1024, 4096, 1, 3581] - - [454, 9915.48] + - [549, 9915.48] - - [1024, 1024, 1, 3960] - - [456, 9045.86] + - [551, 9045.86] - - [4096, 1024, 1, 3463] - - [461, 9884.74] + - [556, 9884.74] - - [1024, 4096, 1, 3478] - - [454, 9927.02] + - [549, 9927.02] - - [4096, 1024, 1, 3262] - - [460, 9852.22] + - [555, 9852.22] - - [1024, 4096, 1, 3438] - - [460, 9912.68] + - [555, 9912.68] - - [1024, 4096, 1, 3244] - - [453, 9900.51] + - [548, 9900.51] - - [1024, 4096, 1, 3445] - - [453, 9920.32] + - [548, 9920.32] - - [4096, 1024, 1, 3328] - - [460, 9888.07] + - [555, 9888.07] - - [1024, 4096, 1, 3492] - - [454, 9937.22] + - [549, 9937.22] - - [4096, 1024, 1, 3211] - - [454, 9847.95] + - [549, 9847.95] - - [1024, 4096, 1, 3910] - - [461, 9946.57] + - [556, 9946.57] - - [1024, 4096, 1, 3314] - - [453, 9932.6] + - [548, 9932.6] - - [4096, 1024, 1, 3859] - - [460, 9902.84] + - [555, 9902.84] - - [4096, 1024, 1, 3383] - - [460, 9875.2] + - [555, 9875.2] - - [1024, 4096, 1, 3409] - - [461, 9926.79] + - [556, 9926.79] - - [1024, 4096, 1, 4020] - - [453, 9941.8] + - [548, 9941.8] - - [4096, 1024, 1, 3530] - - [460, 9872.81] + - [555, 9872.81] - - [4096, 1024, 1, 3411] - - [461, 9875.02] + - [556, 9875.02] - - [1024, 4096, 1, 3566] - - [461, 9921.1] + - [556, 9921.1] - - [4096, 1024, 1, 3493] - - [453, 9875.74] + - [548, 9875.74] - - [4096, 1024, 1, 3184] - - [460, 9873.14] + - [555, 9873.14] - - [1024, 4096, 1, 3072] - - [453, 9923.79] + - [548, 9923.79] - - [1024, 4096, 1, 3431] - - [454, 9911.03] + - [549, 9911.03] - - [4096, 1024, 1, 3306] - - [461, 9853.42] + - [556, 9853.42] - - [1024, 4096, 1, 3352] - - [461, 9913.32] + - [556, 9913.32] - - [4096, 1024, 1, 3295] - - [460, 9862.68] + - [555, 9862.68] - - [64, 123, 528, 122] - - [426, 6950.25] + - [521, 6950.25] - - [1024, 4096, 1, 3517] - - [454, 9920.06] + - [549, 9920.06] - - [64, 102, 624, 101] - - [434, 5791.49] + - [529, 5791.49] - - [4096, 1024, 1, 3426] - - [460, 9891.14] + - [555, 9891.14] - - [4096, 1024, 1, 3385] - - [460, 9868.41] + - [555, 9868.41] - - [1024, 1024, 1, 3978] - - [451, 9008.48] + - [546, 9008.48] - - [4096, 1024, 1, 3572] - - [453, 9884.81] + - [548, 9884.81] - - [4096, 1024, 1, 3459] - - [460, 9892.17] + - [555, 9892.17] - - [1024, 4096, 1, 3374] - - [461, 9908.52] + - [556, 9908.52] - - [4096, 1024, 1, 3166] - - [460, 9832.45] + - [555, 9832.45] - - [4096, 1024, 1, 3093] - - [461, 9841.25] + - [556, 9841.25] - - [4096, 1024, 1, 3523] - - [454, 9879.05] + - [549, 9879.05] - - [4096, 1024, 1, 3413] - - [454, 9880.81] + - [549, 9880.81] - - [1024, 4096, 1, 3996] - - [453, 9948.14] + - [548, 9948.14] - - [1024, 4096, 1, 3452] - - [461, 9915.97] + - [556, 9915.97] - - [4096, 1024, 1, 3232] - - [461, 9876.54] + - [556, 9876.54] - - [4096, 1024, 1, 3400] - - [453, 9867.15] + - [548, 9867.15] - - [4096, 1024, 1, 3334] - - [460, 9868.99] + - [555, 9868.99] - - [1024, 4096, 1, 3345] - - [453, 9920.6] + - [548, 9920.6] - - [1024, 4096, 1, 3538] - - [460, 9933.34] + - [555, 9933.34] - - [1024, 4096, 1, 3466] - - [460, 9920.85] + - [555, 9920.85] - - [4096, 1024, 1, 3315] - - [460, 9876.87] + - [555, 9876.87] - - [4096, 1024, 1, 3214] - - [461, 9847.93] + - [556, 9847.93] - - [1024, 33708, 1, 3900] - - [453, 10331.7] + - [548, 10331.7] - - [64, 160, 400, 160] - - [446, 7440.61] + - [541, 7440.61] - - [1024, 4096, 1, 3367] - - [460, 9926.32] + - [555, 9926.32] - - [1024, 4096, 1, 2917] - - [461, 9904.57] + - [556, 9904.57] - - [1024, 1024, 1, 3995] - - [452, 9000.33] + - [547, 9000.33] - - [64, 132, 480, 134] - - [444, 6146.88] + - [539, 6146.88] - - [1024, 4096, 1, 3544] - - [461, 9924.14] + - [556, 9924.14] - - [4096, 1024, 1, 3414] - - [461, 9867.9] + - [556, 9867.9] - - [4096, 1024, 1, 3565] - - [454, 9870.13] + - [549, 9870.13] - - [1024, 4096, 1, 3512] - - [460, 9919.84] + - [555, 9919.84] - - [1024, 4096, 1, 3191] - - [461, 9914.79] + - [556, 9914.79] - - [64, 27, 2336, 27] - - [428, 3054.71] + - [523, 3054.71] - - [1024, 4096, 1, 3289] - - [461, 9917.2] + - [556, 9917.2] - - [4096, 1024, 1, 3290] - - [460, 9858.41] + - [555, 9858.41] - - [1024, 4096, 1, 3211] - - [461, 9897.16] + - [556, 9897.16] - - [1024, 33708, 1, 3969] - - [454, 10336.1] + - [549, 10336.1] - - [4096, 1024, 1, 3566] - - [460, 9863.0] + - [555, 9863.0] - - [64, 111, 576, 111] - - [434, 6400.91] + - [529, 6400.91] - - [1024, 4096, 1, 3459] - - [460, 9923.03] + - [555, 9923.03] - - [1024, 4096, 1, 3372] - - [453, 9909.86] + - [548, 9909.86] - - [4096, 1024, 1, 3339] - - [460, 9859.3] + - [555, 9859.3] - - [4096, 1024, 1, 3425] - - [460, 9889.34] + - [555, 9889.34] - - [4096, 1024, 1, 3388] - - [460, 9871.67] + - [555, 9871.67] - - [1024, 4096, 1, 3531] - - [453, 9919.0] + - [548, 9919.0] - - [4096, 1024, 1, 3286] - - [461, 9868.42] + - [556, 9868.42] - - [4096, 1024, 1, 3462] - - [460, 9881.88] + - [555, 9881.88] - - [1024, 4096, 1, 3388] - - [453, 9904.69] + - [548, 9904.69] - - [4096, 1024, 1, 3165] - - [453, 9836.33] + - [548, 9836.33] - - [4096, 1024, 1, 3304] - - [460, 9857.55] + - [555, 9857.55] - - [1024, 4096, 1, 2736] - - [460, 9901.07] + - [555, 9901.07] - - [4096, 1024, 1, 3397] - - [460, 9872.1] + - [555, 9872.1] - - [64, 38, 1680, 38] - - [427, 3459.52] + - [522, 3459.52] - - [1024, 4096, 1, 3311] - - [461, 9908.32] + - [556, 9908.32] - - [1024, 4096, 1, 3394] - - [461, 9929.43] + - [556, 9929.43] - - [4096, 1024, 1, 2736] - - [460, 9833.88] + - [555, 9833.88] - - [1024, 4096, 1, 3559] - - [454, 9925.33] + - [549, 9925.33] - - [4096, 1024, 1, 3180] - - [460, 9838.05] + - [555, 9838.05] - - [1024, 4096, 1, 3480] - - [453, 9922.46] + - [548, 9922.46] - - [4096, 1024, 1, 3318] - - [460, 9867.87] + - [555, 9867.87] - - [4096, 1024, 1, 3213] - - [460, 9846.02] + - [555, 9846.02] - - [1024, 4096, 1, 3286] - - [460, 9912.14] + - [555, 9912.14] - - [4096, 1024, 1, 3471] - - [460, 9874.24] + - [555, 9874.24] - - [1024, 4096, 1, 3381] - - [461, 9922.96] + - [556, 9922.96] - - [64, 100, 624, 100] - - [435, 5705.24] + - [530, 5705.24] - - [4096, 1024, 1, 3502] - - [460, 9872.44] + - [555, 9872.44] - - [64, 16, 3840, 16] - - [441, 2091.67] + - [536, 2091.67] - - [1024, 4096, 1, 3552] - - [453, 9943.89] + - [548, 9943.89] - - [4096, 1024, 1, 3519] - - [461, 9869.95] + - [556, 9869.95] - - [1024, 4096, 1, 3300] - - [454, 9916.15] + - [549, 9916.15] - - [1024, 4096, 1, 3419] - - [453, 9914.06] + - [548, 9914.06] - - [4096, 1024, 1, 4030] - - [454, 9893.73] + - [549, 9893.73] - - [4096, 1024, 1, 3976] - - [461, 9898.35] + - [556, 9898.35] - - [1024, 4096, 1, 3473] - - [461, 9928.42] + - [556, 9928.42] - - [1024, 1024, 1, 3977] - - [458, 9009.33] + - [553, 9009.33] - - [4096, 1024, 1, 3428] - - [460, 9876.79] + - [555, 9876.79] - - [1024, 4096, 1, 3433] - - [454, 9923.92] + - [549, 9923.92] - - [4096, 1024, 1, 3534] - - [454, 9864.0] + - [549, 9864.0] - - [4096, 1024, 1, 3461] - - [460, 9873.12] + - [555, 9873.12] - - [4096, 1024, 1, 3681] - - [460, 9898.57] + - [555, 9898.57] - - [4096, 1024, 1, 3495] - - [461, 9876.08] + - [556, 9876.08] - - [4096, 1024, 1, 3351] - - [460, 9879.71] + - [555, 9879.71] - - [1024, 4096, 1, 4059] - - [453, 9948.61] + - [548, 9948.61] - - [4096, 1024, 1, 3990] - - [460, 9900.76] + - [555, 9900.76] - - [1024, 4096, 1, 3325] - - [454, 9903.3] + - [549, 9903.3] - - [1024, 4096, 1, 3408] - - [460, 9932.15] + - [555, 9932.15] - - [64, 59, 1088, 59] - - [434, 5343.77] + - [529, 5343.77] - - [4096, 1024, 1, 3394] - - [461, 9878.17] + - [556, 9878.17] - - [1024, 4096, 1, 3573] - - [461, 9935.3] + - [556, 9935.3] - - [4096, 1024, 1, 3386] - - [460, 9866.38] + - [555, 9866.38] - - [4096, 1024, 1, 3540] - - [460, 9882.33] + - [555, 9882.33] - - [1024, 4096, 1, 3182] - - [454, 9894.45] + - [549, 9894.45] - - [1024, 4096, 1, 3430] - - [453, 9915.24] + - [548, 9915.24] - - [1024, 4096, 1, 3236] - - [461, 9920.56] + - [556, 9920.56] - - [4096, 1024, 1, 2977] - - [460, 9848.08] + - [555, 9848.08] - - [1024, 4096, 1, 3355] - - [460, 9908.78] + - [555, 9908.78] - - [4096, 1024, 1, 3139] - - [460, 9850.71] + - [555, 9850.71] - - [4096, 1024, 1, 3516] - - [454, 9874.21] + - [549, 9874.21] - - [4096, 1024, 1, 3368] - - [454, 9872.64] + - [549, 9872.64] - - [4096, 1024, 1, 3559] - - [453, 9884.32] + - [548, 9884.32] - - [64, 11, 5456, 11] - - [441, 1382.67] + - [536, 1382.67] - - [1024, 4096, 1, 3506] - - [460, 9937.69] + - [555, 9937.69] - - [1024, 4096, 1, 3145] - - [453, 9905.11] + - [548, 9905.11] - - [1024, 4096, 1, 3369] - - [460, 9912.71] + - [555, 9912.71] - - [64, 112, 576, 112] - - [426, 6583.56] + - [521, 6583.56] - - [4096, 1024, 1, 3522] - - [460, 9889.47] + - [555, 9889.47] - - [1024, 33708, 1, 3894] - - [453, 10337.5] + - [548, 10337.5] - - [64, 159, 400, 162] - - [444, 7057.09] + - [539, 7057.09] - - [4096, 1024, 1, 3336] - - [460, 9867.67] + - [555, 9867.67] - - [1024, 4096, 1, 3382] - - [454, 9915.9] + - [549, 9915.9] - - [4096, 1024, 1, 3533] - - [460, 9878.56] + - [555, 9878.56] - - [4096, 1024, 1, 4050] - - [461, 9916.82] + - [556, 9916.82] - - [4096, 1024, 1, 3480] - - [454, 9869.32] + - [549, 9869.32] - - [1024, 4096, 1, 3344] - - [453, 9935.61] + - [548, 9935.61] - - [64, 122, 528, 122] - - [426, 6871.14] + - [521, 6871.14] - - [1024, 4096, 1, 3509] - - [454, 9925.8] + - [549, 9925.8] - - [1024, 4096, 1, 3956] - - [453, 9958.26] + - [548, 9958.26] - - [4096, 1024, 1, 3616] - - [460, 9904.63] + - [555, 9904.63] - - [1024, 4096, 1, 3366] - - [453, 9919.47] + - [548, 9919.47] - - [4096, 1024, 1, 2935] - - [453, 9833.23] + - [548, 9833.23] - - [4096, 1024, 1, 3393] - - [460, 9877.45] + - [555, 9877.45] - - [4096, 1024, 1, 3547] - - [454, 9865.1] + - [549, 9865.1] - - [1024, 4096, 1, 3499] - - [461, 9912.49] + - [556, 9912.49] - - [4096, 1024, 1, 3357] - - [460, 9855.28] + - [555, 9855.28] - - [4096, 1024, 1, 3272] - - [460, 9861.97] + - [555, 9861.97] - - [4096, 1024, 1, 3207] - - [460, 9847.78] + - [555, 9847.78] - - [4096, 1024, 1, 3894] - - [460, 9918.86] + - [555, 9918.86] - - [1024, 4096, 1, 3444] - - [460, 9932.71] + - [555, 9932.71] - - [4096, 1024, 1, 3561] - - [460, 9872.61] + - [555, 9872.61] - - [4096, 1024, 1, 3376] - - [460, 9885.59] + - [555, 9885.59] - - [1024, 4096, 1, 3458] - - [460, 9929.39] + - [555, 9929.39] - - [4096, 1024, 1, 3231] - - [454, 9847.08] + - [549, 9847.08] - - [64, 228, 272, 228] - - [455, 7302.69] + - [550, 7302.69] - - [1024, 4096, 1, 3505] - - [461, 9931.63] + - [556, 9931.63] - - [4096, 1024, 1, 3277] - - [460, 9857.2] + - [555, 9857.2] - - [64, 21, 2976, 21] - - [430, 2436.14] + - [525, 2436.14] - - [1024, 4096, 1, 3391] - - [460, 9911.25] + - [555, 9911.25] - - [64, 32, 1984, 32] - - [442, 3572.17] + - [537, 3572.17] - - [1024, 4096, 1, 3536] - - [461, 9946.9] + - [556, 9946.9] - - [1024, 4096, 1, 3063] - - [460, 9906.92] + - [555, 9906.92] - - [1024, 1024, 1, 3925] - - [452, 9011.45] + - [547, 9011.45] - - [1024, 4096, 1, 3189] - - [454, 9900.95] + - [549, 9900.95] - - [1024, 4096, 1, 2505] - - [460, 9854.85] + - [555, 9854.85] - - [4096, 1024, 1, 3454] - - [453, 9864.96] + - [548, 9864.96] - - [1024, 4096, 1, 3405] - - [461, 9906.33] + - [556, 9906.33] - - [1024, 33708, 1, 4050] - - [454, 10343.7] + - [549, 10343.7] - - [4096, 1024, 1, 3520] - - [460, 9887.03] + - [555, 9887.03] - - [64, 93, 688, 93] - - [437, 6222.86] + - [532, 6222.86] - - [1024, 4096, 1, 3487] - - [461, 9918.69] + - [556, 9918.69] - - [1024, 4096, 1, 3558] - - [461, 9930.99] + - [556, 9930.99] - - [4096, 1024, 1, 3297] - - [460, 9874.31] + - [555, 9874.31] - - [1024, 1024, 1, 3840] - - [456, 9075.42] + - [551, 9075.42] - - [1024, 4096, 1, 3483] - - [460, 9915.38] + - [555, 9915.38] - - [1024, 1024, 1, 3956] - - [459, 9010.03] + - [554, 9010.03] - - [1024, 33708, 1, 3751] - - [454, 10325.9] + - [549, 10325.9] - - [4096, 1024, 1, 3380] - - [460, 9888.47] + - [555, 9888.47] - - [1024, 4096, 1, 3380] - - [453, 9927.25] + - [548, 9927.25] - - [1024, 4096, 1, 3396] - - [461, 9931.96] + - [556, 9931.96] - - [1024, 4096, 1, 3497] - - [454, 9914.86] + - [549, 9914.86] - - [1024, 4096, 1, 3502] - - [461, 9921.52] + - [556, 9921.52] - - [1024, 1024, 1, 3976] - - [456, 9060.3] + - [551, 9060.3] - - [1024, 4096, 1, 3138] - - [454, 9908.66] + - [549, 9908.66] - - [4096, 1024, 1, 3939] - - [453, 9910.23] + - [548, 9910.23] - - [1024, 4096, 1, 3303] - - [454, 9916.64] + - [549, 9916.64] - - [64, 111, 576, 112] - - [434, 6495.19] + - [529, 6495.19] - - [1024, 4096, 1, 3418] - - [460, 9913.35] + - [555, 9913.35] - - [1024, 4096, 1, 3224] - - [454, 9904.05] + - [549, 9904.05] - - [4096, 1024, 1, 3978] - - [460, 9896.28] + - [555, 9896.28] - - [1024, 4096, 1, 3472] - - [453, 9937.48] + - [548, 9937.48] - - [4096, 1024, 1, 3353] - - [461, 9863.97] + - [556, 9863.97] - - [4096, 1024, 1, 3362] - - [460, 9871.06] + - [555, 9871.06] - - [1024, 33708, 1, 3978] - - [453, 10325.4] + - [548, 10325.4] - - [64, 100, 624, 102] - - [429, 5695.67] + - [524, 5695.67] - - [1024, 4096, 1, 3432] - - [461, 9915.56] + - [556, 9915.56] - - [1024, 4096, 1, 3139] - - [460, 9914.21] + - [555, 9914.21] - - [1024, 4096, 1, 3341] - - [461, 9912.1] + - [556, 9912.1] - - [1024, 4096, 1, 3494] - - [454, 9924.6] + - [549, 9924.6] - - [1024, 4096, 1, 3969] - - [453, 9952.28] + - [548, 9952.28] - - [1024, 4096, 1, 3163] - - [461, 9911.79] + - [556, 9911.79] - - [1024, 1024, 1, 3955] - - [451, 9097.86] + - [546, 9097.86] - - [4096, 1024, 1, 3405] - - [460, 9853.84] + - [555, 9853.84] - - [1024, 1024, 1, 4030] - - [451, 9083.86] + - [546, 9083.86] - - [4096, 1024, 1, 3453] - - [460, 9858.88] + - [555, 9858.88] - - [1024, 4096, 1, 3411] - - [461, 9926.54] + - [556, 9926.54] - - [1024, 4096, 1, 3527] - - [454, 9922.65] + - [549, 9922.65] - - [4096, 1024, 1, 3474] - - [460, 9878.49] + - [555, 9878.49] - - [1024, 4096, 1, 3572] - - [460, 9932.0] + - [555, 9932.0] - - [4096, 1024, 1, 3293] - - [460, 9848.26] + - [555, 9848.26] - - [4096, 1024, 1, 3247] - - [460, 9861.45] + - [555, 9861.45] - - [64, 15, 4096, 15] - - [441, 1955.75] + - [536, 1955.75] - - [1024, 4096, 1, 3425] - - [461, 9936.4] + - [556, 9936.4] - - [1024, 4096, 1, 3354] - - [453, 9917.55] + - [548, 9917.55] - - [4096, 1024, 1, 3382] - - [460, 9885.49] + - [555, 9885.49] - - [4096, 1024, 1, 3236] - - [460, 9860.6] + - [555, 9860.6] - - [1024, 4096, 1, 3519] - - [461, 9919.3] + - [556, 9919.3] - - [4096, 1024, 1, 3354] - - [460, 9854.75] + - [555, 9854.75] - - [4096, 1024, 1, 3501] - - [461, 9869.62] + - [556, 9869.62] - - [1024, 1024, 1, 3906] - - [459, 9104.99] + - [554, 9104.99] - - [4096, 1024, 1, 3266] - - [460, 9873.97] + - [555, 9873.97] - - [64, 101, 624, 102] - - [429, 5765.52] + - [524, 5765.52] - - [1024, 4096, 1, 3368] - - [460, 9909.77] + - [555, 9909.77] - - [1024, 4096, 1, 4030] - - [461, 9940.27] + - [556, 9940.27] - - [1024, 4096, 1, 3533] - - [454, 9916.64] + - [549, 9916.64] - - [4096, 1024, 1, 3332] - - [461, 9876.45] + - [556, 9876.45] - - [4096, 1024, 1, 3584] - - [460, 9896.6] + - [555, 9896.6] - - [1024, 4096, 1, 3616] - - [460, 9957.18] + - [555, 9957.18] - - [4096, 1024, 1, 3265] - - [460, 9877.78] + - [555, 9877.78] - - [4096, 1024, 1, 3361] - - [460, 9888.61] + - [555, 9888.61] - - [4096, 1024, 1, 3467] - - [460, 9863.4] + - [555, 9863.4] - - [1024, 4096, 1, 3454] - - [454, 9904.89] + - [549, 9904.89] - - [1024, 4096, 1, 3101] - - [461, 9893.12] + - [556, 9893.12] - - [1024, 4096, 1, 3508] - - [461, 9931.54] + - [556, 9931.54] - - [4096, 1024, 1, 3267] - - [460, 9864.48] + - [555, 9864.48] - - [64, 54, 1184, 54] - - [426, 4906.02] + - [521, 4906.02] - - [4096, 1024, 1, 3419] - - [460, 9872.56] + - [555, 9872.56] - - [4096, 1024, 1, 3822] - - [460, 9892.63] + - [555, 9892.63] - - [1024, 4096, 1, 3266] - - [460, 9918.58] + - [555, 9918.58] - - [4096, 1024, 1, 3440] - - [461, 9890.16] + - [556, 9890.16] - - [1024, 4096, 1, 3361] - - [460, 9930.97] + - [555, 9930.97] - - [1024, 4096, 1, 3546] - - [454, 9926.56] + - [549, 9926.56] - - [4096, 1024, 1, 3473] - - [460, 9889.06] + - [555, 9889.06] - - [4096, 1024, 1, 3546] - - [461, 9872.27] + - [556, 9872.27] - - [1024, 4096, 1, 3088] - - [454, 9918.03] + - [549, 9918.03] - - [1024, 4096, 1, 3535] - - [461, 9921.2] + - [556, 9921.2] - - [1024, 4096, 1, 3447] - - [461, 9920.63] + - [556, 9920.63] - - [1024, 4096, 1, 3560] - - [460, 9925.48] + - [555, 9925.48] - - [1024, 4096, 1, 3422] - - [454, 9922.21] + - [549, 9922.21] - - [1024, 4096, 1, 3469] - - [453, 9906.18] + - [548, 9906.18] - - [4096, 1024, 1, 3488] - - [460, 9903.26] + - [555, 9903.26] - - [1024, 4096, 1, 3110] - - [460, 9906.76] + - [555, 9906.76] - - [1024, 4096, 1, 3265] - - [461, 9916.69] + - [556, 9916.69] - - [1024, 4096, 1, 3291] - - [460, 9902.73] + - [555, 9902.73] - - [1024, 4096, 1, 3390] - - [461, 9907.22] + - [556, 9907.22] - - [4096, 1024, 1, 3046] - - [460, 9847.68] + - [555, 9847.68] - - [1024, 4096, 1, 3539] - - [461, 9933.49] + - [556, 9933.49] - - [4096, 1024, 1, 3221] - - [461, 9860.74] + - [556, 9860.74] - - [4096, 1024, 1, 3433] - - [460, 9872.74] + - [555, 9872.74] - - [4096, 1024, 1, 3364] - - [461, 9881.91] + - [556, 9881.91] - - [4096, 1024, 1, 3470] - - [460, 9858.56] + - [555, 9858.56] - - [1024, 4096, 1, 3404] - - [453, 9907.27] + - [548, 9907.27] - - [1024, 33708, 1, 3968] - - [454, 10350.3] + - [549, 10350.3] - - [4096, 1024, 1, 3088] - - [460, 9869.06] + - [555, 9869.06] - - [1024, 4096, 1, 3247] - - [460, 9901.02] + - [555, 9901.02] - - [1024, 33708, 1, 3996] - - [453, 10328.5] + - [548, 10328.5] - - [4096, 1024, 1, 3482] - - [461, 9866.99] + - [556, 9866.99] - - [1024, 1024, 1, 3796] - - [456, 9031.68] + - [551, 9031.68] - - [4096, 1024, 1, 3995] - - [461, 9896.78] + - [556, 9896.78] - - [1024, 1024, 1, 3859] - - [458, 9097.36] + - [553, 9097.36] - - [1024, 4096, 1, 3280] - - [454, 9934.05] + - [549, 9934.05] - - [4096, 1024, 1, 3271] - - [461, 9860.09] + - [556, 9860.09] - - [64, 10, 5952, 10] - - [441, 1221.02] + - [536, 1221.02] - - [4096, 1024, 1, 3545] - - [460, 9877.35] + - [555, 9877.35] - - [4096, 1024, 1, 3476] - - [453, 9882.57] + - [548, 9882.57] - - [4096, 1024, 1, 3496] - - [454, 9880.5] + - [549, 9880.5] - - [4096, 1024, 1, 3191] - - [454, 9858.7] + - [549, 9858.7] - - [4096, 1024, 1, 3311] - - [461, 9853.2] + - [556, 9853.2] - - [1024, 4096, 1, 3302] - - [461, 9919.32] + - [556, 9919.32] - - [1024, 4096, 1, 3681] - - [460, 9944.99] + - [555, 9944.99] - - [4096, 1024, 1, 3582] - - [453, 9869.77] + - [548, 9869.77] - - [4096, 1024, 1, 3421] - - [461, 9856.08] + - [556, 9856.08] - - [4096, 1024, 1, 3560] - - [454, 9884.48] + - [549, 9884.48] - - [1024, 4096, 1, 3495] - - [461, 9930.13] + - [556, 9930.13] - - [4096, 1024, 1, 3186] - - [460, 9870.59] + - [555, 9870.59] - - [4096, 1024, 1, 3925] - - [460, 9904.0] + - [555, 9904.0] - - [64, 71, 896, 71] - - [445, 5004.79] + - [540, 5004.79] - - [1024, 4096, 1, 3435] - - [461, 9916.58] + - [556, 9916.58] - - [4096, 1024, 1, 3434] - - [460, 9871.29] + - [555, 9871.29] - - [1024, 33708, 1, 4012] - - [453, 10332.5] + - [548, 10332.5] - - [1024, 4096, 1, 3340] - - [453, 9918.11] + - [548, 9918.11] - - [1024, 1024, 1, 3860] - - [451, 8999.36] + - [546, 8999.36] - - [4096, 1024, 1, 3489] - - [460, 9882.02] + - [555, 9882.02] - - [1024, 4096, 1, 3162] - - [461, 9906.28] + - [556, 9906.28] - - [4096, 1024, 1, 3436] - - [460, 9858.12] + - [555, 9858.12] - - [1024, 1024, 1, 4005] - - [457, 9043.06] + - [552, 9043.06] - - [64, 84, 752, 84] - - [430, 5629.93] + - [525, 5629.93] - - [4096, 1024, 1, 3574] - - [460, 9886.7] + - [555, 9886.7] - - [4096, 1024, 1, 3469] - - [453, 9856.26] + - [548, 9856.26] - - [1024, 4096, 1, 3410] - - [454, 9924.74] + - [549, 9924.74] - - [1024, 4096, 1, 3216] - - [453, 9930.67] + - [548, 9930.67] - - [4096, 1024, 1, 3095] - - [460, 9847.01] + - [555, 9847.01] - - [1024, 1024, 1, 3990] - - [459, 9089.04] + - [554, 9089.04] - - [4096, 1024, 1, 3448] - - [460, 9863.94] + - [555, 9863.94] - - [1024, 4096, 1, 3176] - - [461, 9914.01] + - [556, 9914.01] - - [64, 49, 1296, 49] - - [426, 4437.46] + - [521, 4437.46] - - [4096, 1024, 1, 2918] - - [460, 9830.93] + - [555, 9830.93] - - [64, 14, 4368, 14] - - [440, 1802.47] + - [535, 1802.47] - - [1024, 4096, 1, 3424] - - [460, 9934.05] + - [555, 9934.05] - - [4096, 1024, 1, 3402] - - [453, 9863.12] + - [548, 9863.12] - - [4096, 1024, 1, 3145] - - [454, 9856.56] + - [549, 9856.56] - - [64, 134, 480, 134] - - [446, 6184.05] + - [541, 6184.05] - - [1024, 33708, 1, 3976] - - [454, 10330.1] + - [549, 10330.1] - - [4096, 1024, 1, 3518] - - [453, 9856.07] + - [548, 9856.07] - - [4096, 1024, 1, 3110] - - [460, 9856.46] + - [555, 9856.46] - - [4096, 1024, 1, 3325] - - [460, 9852.36] + - [555, 9852.36] - - [1024, 33708, 1, 3999] - - [453, 10329.7] + - [548, 10329.7] - - [4096, 1024, 1, 2985] - - [460, 9837.3] + - [555, 9837.3] - - [1024, 4096, 1, 3371] - - [453, 9913.03] + - [548, 9913.03] - - [4096, 1024, 1, 3342] - - [460, 9863.16] + - [555, 9863.16] - - [4096, 1024, 1, 3141] - - [454, 9849.91] + - [549, 9849.91] - - [4096, 1024, 1, 3532] - - [454, 9866.3] + - [549, 9866.3] - - [64, 78, 816, 78] - - [431, 5316.88] + - [526, 5316.88] - - [1024, 4096, 1, 3169] - - [461, 9910.45] + - [556, 9910.45] - - [1024, 4096, 1, 3514] - - [460, 9918.0] + - [555, 9918.0] - - [4096, 1024, 1, 3780] - - [461, 9899.75] + - [556, 9899.75] - - [1024, 4096, 1, 3098] - - [453, 9901.62] + - [548, 9901.62] - - [1024, 4096, 1, 3449] - - [461, 9919.85] + - [556, 9919.85] - - [1024, 4096, 1, 3222] - - [453, 9917.66] + - [548, 9917.66] - - [1024, 4096, 1, 3346] - - [454, 9912.91] + - [549, 9912.91] - - [4096, 1024, 1, 3064] - - [461, 9848.79] + - [556, 9848.79] - - [4096, 1024, 1, 3511] - - [460, 9873.39] + - [555, 9873.39] - - [4096, 1024, 1, 3384] - - [460, 9870.98] + - [555, 9870.98] - - [4096, 1024, 1, 3356] - - [454, 9853.45] + - [549, 9853.45] - - [1024, 4096, 1, 3796] - - [453, 9940.66] + - [548, 9940.66] - - [4096, 1024, 1, 3427] - - [460, 9883.14] + - [555, 9883.14] - - [4096, 1024, 1, 3390] - - [460, 9863.79] + - [555, 9863.79] - - [4096, 1024, 1, 3573] - - [461, 9886.02] + - [556, 9886.02] - - [4096, 1024, 1, 3456] - - [454, 9890.61] + - [549, 9890.61] - - [1024, 4096, 1, 3360] - - [461, 9938.1] + - [556, 9938.1] - - [1024, 33708, 1, 3977] - - [454, 10327.2] + - [549, 10327.2] - - [1024, 4096, 1, 2918] - - [453, 9902.84] + - [548, 9902.84] - - [4096, 1024, 1, 3975] - - [460, 9905.27] + - [555, 9905.27] - - [4096, 1024, 1, 3525] - - [461, 9879.91] + - [556, 9879.91] - - [4096, 1024, 1, 3398] - - [453, 9873.91] + - [548, 9873.91] - - [4096, 1024, 1, 3640] - - [460, 9885.16] + - [555, 9885.16] - - [1024, 1024, 1, 3999] - - [452, 8995.42] + - [547, 8995.42] - - [4096, 1024, 1, 3014] - - [460, 9841.32] + - [555, 9841.32] - - [1024, 4096, 1, 3446] - - [453, 9917.21] + - [548, 9917.21] - - [1024, 33708, 1, 3796] - - [453, 10339.0] + - [548, 10339.0] - - [4096, 1024, 1, 3101] - - [453, 9827.34] + - [548, 9827.34] - - [4096, 1024, 1, 3563] - - [461, 9863.03] + - [556, 9863.03] - - [4096, 1024, 1, 3539] - - [453, 9889.54] + - [548, 9889.54] - - [4096, 1024, 1, 3182] - - [460, 9833.79] + - [555, 9833.79] - - [1024, 4096, 1, 3468] - - [454, 9913.05] + - [549, 9913.05] - - [4096, 1024, 1, 3312] - - [460, 9889.85] + - [555, 9889.85] - - [4096, 1024, 1, 3215] - - [460, 9853.88] + - [555, 9853.88] - - [4096, 1024, 1, 3910] - - [460, 9894.72] + - [555, 9894.72] - - [1024, 33708, 1, 3780] - - [454, 10332.0] + - [549, 10332.0] - - [1024, 4096, 1, 3290] - - [460, 9915.08] + - [555, 9915.08] - - [1024, 4096, 1, 4012] - - [460, 9942.65] + - [555, 9942.65] - - [1024, 4096, 1, 3385] - - [460, 9915.83] + - [555, 9915.83] - - [1024, 33708, 1, 3975] - - [453, 10330.1] + - [548, 10330.1] - - [4096, 1024, 1, 3996] - - [460, 9891.31] + - [555, 9891.31] - - [4096, 1024, 1, 2765] - - [461, 9800.38] + - [556, 9800.38] - - [4096, 1024, 1, 3538] - - [461, 9886.22] + - [556, 9886.22] - - [4096, 1024, 1, 3415] - - [461, 9874.6] + - [556, 9874.6] - - [1024, 4096, 1, 3554] - - [460, 9931.99] + - [555, 9931.99] - - [4096, 1024, 1, 3513] - - [454, 9874.25] + - [549, 9874.25] - - [1024, 4096, 1, 3304] - - [454, 9907.73] + - [549, 9907.73] - - [4096, 1024, 1, 3294] - - [460, 9851.25] + - [555, 9851.25] - - [4096, 1024, 1, 3396] - - [461, 9880.7] + - [556, 9880.7] - - [1024, 4096, 1, 3213] - - [454, 9891.12] + - [549, 9891.12] - - [4096, 1024, 1, 3137] - - [454, 9857.41] + - [549, 9857.41] - - [4096, 1024, 1, 3552] - - [460, 9904.22] + - [555, 9904.22] - - [1024, 1024, 1, 4020] - - [459, 9098.87] + - [554, 9098.87] - - [64, 13, 4672, 13] - - [441, 1693.54] + - [536, 1693.54] - - [1024, 4096, 1, 3461] - - [460, 9918.45] + - [555, 9918.45] - - [4096, 1024, 1, 3263] - - [453, 9843.89] + - [548, 9843.89] - - [4096, 1024, 1, 3430] - - [460, 9885.26] + - [555, 9885.26] - - [4096, 1024, 1, 3389] - - [460, 9859.23] + - [555, 9859.23] - - [4096, 1024, 1, 3528] - - [460, 9873.01] + - [555, 9873.01] - - [1024, 4096, 1, 3463] - - [461, 9929.61] + - [556, 9929.61] - - [4096, 1024, 1, 3526] - - [461, 9876.9] + - [556, 9876.9] - - [4096, 1024, 1, 3154] - - [460, 9858.25] + - [555, 9858.25] - - [4096, 1024, 1, 3499] - - [461, 9862.92] + - [556, 9862.92] - - [1024, 1024, 1, 3939] - - [459, 9107.41] + - [554, 9107.41] - - [4096, 1024, 1, 3955] - - [461, 9906.28] + - [556, 9906.28] - - [1024, 4096, 1, 3297] - - [454, 9925.34] + - [549, 9925.34] - - [1024, 4096, 1, 3233] - - [460, 9920.65] + - [555, 9920.65] - - [1024, 4096, 1, 3226] - - [460, 9911.35] + - [555, 9911.35] - - [4096, 1024, 1, 3404] - - [460, 9867.28] + - [555, 9867.28] - - [4096, 1024, 1, 3355] - - [460, 9862.66] + - [555, 9862.66] - - [1024, 4096, 1, 3542] - - [460, 9926.49] + - [555, 9926.49] - - [4096, 1024, 1, 3181] - - [461, 9831.86] + - [556, 9831.86] - - [1024, 4096, 1, 3474] - - [460, 9928.03] + - [555, 9928.03] - - [4096, 1024, 1, 3319] - - [460, 9870.28] + - [555, 9870.28] - - [1024, 4096, 1, 3434] - - [453, 9917.51] + - [548, 9917.51] - - [1024, 4096, 1, 3860] - - [460, 9945.32] + - [555, 9945.32] - - [1024, 4096, 1, 3343] - - [453, 9914.66] + - [548, 9914.66] - - [64, 77, 816, 78] - - [431, 5276.97] + - [526, 5276.97] - - [1024, 4096, 1, 3488] - - [460, 9945.81] + - [555, 9945.81] - - [1024, 4096, 1, 3046] - - [460, 9908.78] + - [555, 9908.78] - - [1024, 4096, 1, 3141] - - [461, 9909.18] + - [556, 9909.18] - - [1024, 4096, 1, 3516] - - [461, 9911.38] + - [556, 9911.38] - - [4096, 1024, 1, 3147] - - [460, 9840.47] + - [555, 9840.47] - - [1024, 1024, 1, 4059] - - [452, 9009.78] + - [547, 9009.78] - - [1024, 1024, 1, 3944] - - [452, 9006.17] + - [547, 9006.17] - - [1024, 4096, 1, 3421] - - [461, 9919.86] + - [556, 9919.86] - - [4096, 1024, 1, 3944] - - [454, 9899.53] + - [549, 9899.53] - - [64, 45, 1424, 45] - - [439, 4068.67] + - [534, 4068.67] - - [1024, 4096, 1, 3574] - - [454, 9930.19] + - [549, 9930.19] - - [1024, 4096, 1, 3977] - - [453, 9944.28] + - [548, 9944.28] - - [1024, 1024, 1, 3968] - - [458, 9045.22] + - [553, 9045.22] - - [1024, 4096, 1, 2985] - - [460, 9887.65] + - [555, 9887.65] - - [64, 193, 320, 193] - - [447, 6631.35] + - [542, 6631.35] - - [1024, 4096, 1, 3427] - - [461, 9933.41] + - [556, 9933.41] - - [64, 12, 5040, 12] - - [441, 1552.53] + - [536, 1552.53] - - [1024, 4096, 1, 3482] - - [461, 9942.22] + - [556, 9942.22] - - [1024, 4096, 1, 3332] - - [453, 9923.58] + - [548, 9923.58] - - [1024, 1024, 1, 3720] - - [457, 9039.56] + - [552, 9039.56] - - [4096, 1024, 1, 3308] - - [461, 9852.66] + - [556, 9852.66] - - [1024, 4096, 1, 3513] - - [461, 9919.99] + - [556, 9919.99] - - [1024, 4096, 1, 3154] - - [454, 9908.46] + - [549, 9908.46] - - [1024, 4096, 1, 3955] - - [461, 9950.01] + - [556, 9950.01] - - [1024, 4096, 1, 2967] - - [461, 9897.44] + - [556, 9897.44] - - [1024, 33708, 1, 3942] - - [453, 10336.1] + - [548, 10336.1] - - [1024, 4096, 1, 3319] - - [461, 9912.45] + - [556, 9912.45] - - [4096, 1024, 1, 3860] - - [460, 9909.29] + - [555, 9909.29] - - [1024, 4096, 1, 3548] - - [453, 9924.21] + - [548, 9924.21] - - [4096, 1024, 1, 3977] - - [461, 9891.44] + - [556, 9891.44] - - [4096, 1024, 1, 3535] - - [460, 9867.84] + - [555, 9867.84] - - [1024, 4096, 1, 3541] - - [461, 9923.16] + - [556, 9923.16] - - [1024, 1024, 1, 3910] - - [458, 9080.4] + - [553, 9080.4] - - [1024, 33708, 1, 3584] - - [453, 10333.0] + - [548, 10333.0] - - [1024, 4096, 1, 3168] - - [454, 9926.27] + - [549, 9926.27] - - [1024, 4096, 1, 3448] - - [461, 9922.42] + - [556, 9922.42] - - [4096, 1024, 1, 3343] - - [460, 9857.23] + - [555, 9857.23] - - [64, 35, 1808, 35] - - [443, 3175.44] + - [538, 3175.44] - - [1024, 4096, 1, 3357] - - [454, 9902.41] + - [549, 9902.41] - - [64, 143, 432, 143] - - [444, 6489.7] + - [539, 6489.7] - - [4096, 1024, 1, 3510] - - [460, 9867.4] + - [555, 9867.4] - - [4096, 1024, 1, 3369] - - [460, 9863.44] + - [555, 9863.44] - - [64, 92, 688, 93] - - [431, 6188.3] + - [526, 6188.3] - - [4096, 1024, 1, 3379] - - [460, 9870.12] + - [555, 9870.12] - - [1024, 4096, 1, 3276] - - [460, 9904.77] + - [555, 9904.77] - - [1024, 4096, 1, 3363] - - [460, 9925.13] + - [555, 9925.13] - - [4096, 1024, 1, 3055] - - [460, 9831.92] + - [555, 9831.92] - - [1024, 4096, 1, 3524] - - [453, 9923.79] + - [548, 9923.79] - - [4096, 1024, 1, 3057] - - [460, 9852.87] + - [555, 9852.87] - - [1024, 33708, 1, 3720] - - [454, 10327.1] + - [549, 10327.1] - - [1024, 4096, 1, 3383] - - [453, 9919.39] + - [548, 9919.39] - - [1024, 4096, 1, 3522] - - [454, 9932.56] + - [549, 9932.56] - - [1024, 33708, 1, 3956] - - [453, 10333.8] + - [548, 10333.8] - - [1024, 4096, 1, 3481] - - [453, 9922.08] + - [548, 9922.08] - - [4096, 1024, 1, 3562] - - [461, 9874.86] + - [556, 9874.86] - - [4096, 1024, 1, 3299] - - [460, 9872.97] + - [555, 9872.97] - - [1024, 4096, 1, 3262] - - [454, 9924.83] + - [549, 9924.83] - - [1024, 4096, 1, 3840] - - [453, 9961.84] + - [548, 9961.84] - - [1024, 33708, 1, 4026] - - [453, 10334.3] + - [548, 10334.3] - - [4096, 1024, 1, 3168] - - [454, 9878.45] + - [549, 9878.45] - - [64, 101, 624, 101] - - [434, 5734.72] + - [529, 5734.72] - - [1024, 4096, 1, 3999] - - [453, 9947.1] + - [548, 9947.1] - - [1024, 4096, 1, 3549] - - [453, 9923.3] + - [548, 9923.3] - - [4096, 1024, 1, 3375] - - [460, 9868.89] + - [555, 9868.89] - - [1024, 4096, 1, 3496] - - [461, 9928.67] + - [556, 9928.67] - - [64, 29, 2176, 29] - - [430, 3290.02] + - [525, 3290.02] - - [1024, 4096, 1, 3190] - - [461, 9897.61] + - [556, 9897.61] - - [4096, 1024, 1, 3273] - - [461, 9853.65] + - [556, 9853.65] - - [1024, 4096, 1, 3406] - - [460, 9907.04] + - [555, 9907.04] - - [4096, 1024, 1, 4005] - - [453, 9907.97] + - [548, 9907.97] - - [4096, 1024, 1, 3555] - - [460, 9878.96] + - [555, 9878.96] - - [4096, 1024, 1, 2505] - - [460, 9785.1] + - [555, 9785.1] - - [1024, 4096, 1, 3460] - - [460, 9930.24] + - [555, 9930.24] - - [64, 17, 3632, 17] - - [431, 1917.27] + - [526, 1917.27] - - [1024, 4096, 1, 3579] - - [454, 9920.94] + - [549, 9920.94] - - [1024, 33708, 1, 4030] - - [454, 10327.7] + - [549, 10327.7] - - [1024, 4096, 1, 3510] - - [454, 9931.31] + - [549, 9931.31] - - [1024, 1024, 1, 3969] - - [451, 9020.83] + - [546, 9020.83] - - [1024, 4096, 1, 3282] - - [461, 9920.05] + - [556, 9920.05] - - [1024, 4096, 1, 3377] - - [453, 9927.34] + - [548, 9927.34] - - [1024, 4096, 1, 2935] - - [461, 9903.48] + - [556, 9903.48] - - [64, 41, 1552, 41] - - [431, 3740.48] + - [526, 3740.48] - - [1024, 4096, 1, 3498] - - [453, 9915.01] + - [548, 9915.01] - - [1024, 4096, 1, 3593] - - [460, 9925.64] + - [555, 9925.64] - - [1024, 1024, 1, 3948] - - [459, 9009.03] + - [554, 9009.03] - - [4096, 1024, 1, 3226] - - [461, 9854.75] + - [556, 9854.75] - - [1024, 4096, 1, 2499] - - [460, 9904.82] + - [555, 9904.82] - - [1024, 4096, 1, 3296] - - [453, 9926.89] + - [548, 9926.89] - - [1024, 4096, 1, 3455] - - [460, 9917.52] + - [555, 9917.52] - - [1024, 4096, 1, 3399] - - [454, 9919.7] + - [549, 9919.7] - - [1024, 4096, 1, 3205] - - [453, 9917.74] + - [548, 9917.74] - - [4096, 1024, 1, 4026] - - [461, 9897.81] + - [556, 9897.81] - - [1024, 4096, 1, 3484] - - [453, 9915.53] + - [548, 9915.53] - - [4096, 1024, 1, 3302] - - [461, 9862.8] + - [556, 9862.8] - - [1024, 4096, 1, 3485] - - [461, 9913.0] + - [556, 9913.0] - - [1024, 1024, 1, 3996] - - [459, 9008.77] + - [554, 9008.77] - - [1024, 4096, 1, 3126] - - [454, 9910.16] + - [549, 9910.16] - - [1024, 4096, 1, 4050] - - [453, 9951.21] + - [548, 9951.21] - - [4096, 1024, 1, 3235] - - [454, 9870.74] + - [549, 9870.74] - - [1024, 33708, 1, 3955] - - [453, 10336.1] + - [548, 10336.1] - - [1024, 4096, 1, 3342] - - [453, 9903.85] + - [548, 9903.85] - - [1024, 1024, 1, 3900] - - [458, 9082.92] + - [553, 9082.92] - - [1024, 4096, 1, 3397] - - [461, 9922.7] + - [556, 9922.7] - - [4096, 1024, 1, 3491] - - [461, 9880.75] + - [556, 9880.75] - - [1024, 4096, 1, 3503] - - [453, 9923.28] + - [548, 9923.28] - - [1024, 4096, 1, 3140] - - [454, 9908.41] + - [549, 9908.41] - - [4096, 1024, 1, 3121] - - [460, 9860.32] + - [555, 9860.32] - - [4096, 1024, 1, 3276] - - [460, 9854.19] + - [555, 9854.19] - - [1024, 4096, 1, 3321] - - [461, 9917.86] + - [556, 9917.86] - - [1024, 4096, 1, 3870] - - [461, 9931.07] + - [556, 9931.07] - - [4096, 1024, 1, 3475] - - [460, 9877.58] + - [555, 9877.58] - - [1024, 4096, 1, 2984] - - [460, 9895.59] + - [555, 9895.59] - - [4096, 1024, 1, 3363] - - [454, 9873.44] + - [549, 9873.44] - - [1024, 4096, 1, 3582] - - [460, 9920.87] + - [555, 9920.87] - - [4096, 1024, 1, 3509] - - [460, 9886.86] + - [555, 9886.86] - - [1024, 4096, 1, 3426] - - [453, 9928.86] + - [548, 9928.86] - - [4096, 1024, 1, 3136] - - [460, 9872.61] + - [555, 9872.61] - - [1024, 4096, 1, 3232] - - [461, 9926.29] + - [556, 9926.29] - - [4096, 1024, 1, 3103] - - [460, 9839.03] + - [555, 9839.03] - - [1024, 4096, 1, 3335] - - [454, 9913.37] + - [549, 9913.37] - - [1024, 4096, 1, 3900] - - [453, 9938.01] + - [548, 9938.01] - - [4096, 1024, 1, 3512] - - [454, 9877.26] + - [549, 9877.26] - - [4096, 1024, 1, 3222] - - [460, 9859.77] + - [555, 9859.77] - - [1024, 4096, 1, 3165] - - [460, 9899.71] + - [555, 9899.71] - - [4096, 1024, 1, 3408] - - [460, 9899.68] + - [555, 9899.68] - - [4096, 1024, 1, 3751] - - [460, 9891.49] + - [555, 9891.49] - - [1024, 4096, 1, 3318] - - [453, 9913.42] + - [548, 9913.42] - - [4096, 1024, 1, 3442] - - [461, 9880.21] + - [556, 9880.21] - - [1024, 4096, 1, 3413] - - [460, 9921.9] + - [555, 9921.9] - - [4096, 1024, 1, 3524] - - [460, 9879.22] + - [555, 9879.22] - - [1024, 4096, 1, 3976] - - [461, 9945.57] + - [556, 9945.57] - - [1024, 4096, 1, 3475] - - [461, 9932.51] + - [556, 9932.51] - - [1024, 4096, 1, 3534] - - [453, 9911.49] + - [548, 9911.49] - - [4096, 1024, 1, 3301] - - [460, 9872.75] + - [555, 9872.75] - - [4096, 1024, 1, 3248] - - [460, 9878.22] + - [555, 9878.22] - - [1024, 4096, 1, 2977] - - [454, 9899.93] + - [549, 9899.93] - - [4096, 1024, 1, 3346] - - [460, 9876.07] + - [555, 9876.07] - - [1024, 4096, 1, 3451] - - [453, 9920.16] + - [548, 9920.16] - - [1024, 4096, 1, 3257] - - [454, 9905.02] + - [549, 9905.02] - - [1024, 1024, 1, 3640] - - [452, 8983.39] + - [547, 8983.39] - - [1024, 4096, 1, 3356] - - [453, 9904.48] + - [548, 9904.48] - - [4096, 1024, 1, 3348] - - [461, 9872.53] + - [556, 9872.53] - - [4096, 1024, 1, 3335] - - [460, 9865.82] + - [555, 9865.82] - - [4096, 1024, 1, 3505] - - [460, 9888.88] + - [555, 9888.88] - - [1024, 4096, 1, 3490] - - [453, 9938.0] + - [548, 9938.0] - - [4096, 1024, 1, 3447] - - [460, 9865.39] + - [555, 9865.39] - - [1024, 4096, 1, 3267] - - [461, 9919.32] + - [556, 9919.32] - - [4096, 1024, 1, 3230] - - [460, 9853.2] + - [555, 9853.2] - - [4096, 1024, 1, 3455] - - [460, 9862.44] + - [555, 9862.44] - - [1024, 4096, 1, 3925] - - [453, 9945.64] + - [548, 9945.64] - - [1024, 4096, 1, 3362] - - [454, 9921.63] + - [549, 9921.63] - - [4096, 1024, 1, 3969] - - [461, 9911.98] + - [556, 9911.98] - - [4096, 1024, 1, 3527] - - [460, 9882.87] + - [555, 9882.87] - - [1024, 4096, 1, 3585] - - [454, 9946.52] + - [549, 9946.52] - - [4096, 1024, 1, 3063] - - [460, 9854.03] + - [555, 9854.03] - - [4096, 1024, 1, 3435] - - [460, 9867.13] + - [555, 9867.13] - - [4096, 1024, 1, 3366] - - [461, 9864.02] + - [556, 9864.02] - - [4096, 1024, 1, 3581] - - [453, 9868.57] + - [548, 9868.57] - - [1024, 33708, 1, 3906] - - [453, 10339.3] + - [548, 10339.3] - - [1024, 4096, 1, 3464] - - [461, 9916.21] + - [556, 9916.21] - - [1024, 4096, 1, 3440] - - [460, 9945.25] + - [555, 9945.25] - - [4096, 1024, 1, 3143] - - [460, 9846.76] + - [555, 9846.76] - - [1024, 4096, 1, 3349] - - [454, 9912.83] + - [549, 9912.83] - - [4096, 1024, 1, 3416] - - [460, 9885.13] + - [555, 9885.13] - - [4096, 1024, 1, 3365] - - [460, 9876.0] + - [555, 9876.0] - - [1024, 4096, 1, 3470] - - [461, 9914.98] + - [556, 9914.98] - - [4096, 1024, 1, 3287] - - [460, 9860.69] + - [555, 9860.69] - - [1024, 4096, 1, 3441] - - [461, 9928.98] + - [556, 9928.98] - - [4096, 1024, 1, 3224] - - [460, 9857.83] + - [555, 9857.83] - - [1024, 4096, 1, 3387] - - [453, 9911.72] + - [548, 9911.72] - - [1024, 4096, 1, 3547] - - [453, 9920.36] + - [548, 9920.36] - - [4096, 1024, 1, 3478] - - [454, 9882.9] + - [549, 9882.9] - - [4096, 1024, 1, 3548] - - [461, 9869.45] + - [556, 9869.45] - - [1024, 33708, 1, 4020] - - [453, 10345.3] + - [548, 10345.3] - - [4096, 1024, 1, 3320] - - [460, 9863.74] + - [555, 9863.74] - - [1024, 4096, 1, 3906] - - [460, 9942.67] + - [555, 9942.67] - - [4096, 1024, 1, 3796] - - [460, 9899.13] + - [555, 9899.13] - - [1024, 4096, 1, 3306] - - [453, 9902.4] + - [548, 9902.4] - - [1024, 4096, 1, 3401] - - [461, 9913.95] + - [556, 9913.95] - - [64, 147, 432, 147] - - [444, 6626.6] + - [539, 6626.6] - - [1024, 4096, 1, 3215] - - [461, 9911.24] + - [556, 9911.24] - - [4096, 1024, 1, 4012] - - [461, 9898.2] + - [556, 9898.2] - - [1024, 4096, 1, 2765] - - [461, 9863.73] + - [556, 9863.73] - - [4096, 1024, 1, 3554] - - [454, 9883.52] + - [549, 9883.52] - - [4096, 1024, 1, 3423] - - [460, 9866.72] + - [555, 9866.72] - - [1024, 1024, 1, 3751] - - [458, 9006.36] + - [553, 9006.36] - - [1024, 4096, 1, 3562] - - [454, 9922.08] + - [549, 9922.08] - - [1024, 4096, 1, 3489] - - [453, 9936.78] + - [548, 9936.78] - - [4096, 1024, 1, 3358] - - [460, 9858.22] + - [555, 9858.22] - - [4096, 1024, 1, 3270] - - [461, 9850.84] + - [556, 9850.84] - - [1024, 4096, 1, 3293] - - [453, 9905.33] + - [548, 9905.33] - - [1024, 4096, 1, 3376] - - [453, 9934.98] + - [548, 9934.98] - - [4096, 1024, 1, 3245] - - [460, 9852.52] + - [555, 9852.52] - - [4096, 1024, 1, 3541] - - [460, 9887.22] + - [555, 9887.22] - - [4096, 1024, 1, 3443] - - [460, 9871.73] + - [555, 9871.73] - - [4096, 1024, 1, 3438] - - [461, 9863.86] + - [556, 9863.86] - - [4096, 1024, 1, 3244] - - [460, 9859.76] + - [555, 9859.76] - - [1024, 4096, 1, 3365] - - [460, 9922.1] + - [555, 9922.1] - - [1024, 4096, 1, 3299] - - [454, 9923.38] + - [549, 9923.38] - - [4096, 1024, 1, 3840] - - [460, 9914.75] + - [555, 9914.75] - - [1024, 4096, 1, 3471] - - [461, 9918.38] + - [556, 9918.38] - - [1024, 4096, 1, 3398] - - [453, 9918.99] + - [548, 9918.99] - - [4096, 1024, 1, 3162] - - [460, 9843.93] + - [555, 9843.93] - - [1024, 4096, 1, 4005] - - [454, 9947.87] + - [549, 9947.87] - - [4096, 1024, 1, 3579] - - [460, 9868.25] + - [555, 9868.25] - - [64, 18, 3440, 18] - - [436, 2059.33] + - [531, 2059.33] - - [64, 177, 352, 177] - - [455, 7315.4] + - [550, 7315.4] - - [1024, 4096, 1, 3121] - - [461, 9930.34] + - [556, 9930.34] - - [4096, 1024, 1, 3441] - - [460, 9883.28] + - [555, 9883.28] - - [4096, 1024, 1, 3422] - - [460, 9858.41] + - [555, 9858.41] - - [4096, 1024, 1, 3444] - - [460, 9887.03] + - [555, 9887.03] - - [1024, 4096, 1, 3337] - - [454, 9911.45] + - [549, 9911.45] - - [4096, 1024, 1, 3550] - - [453, 9871.87] + - [548, 9871.87] - - [1024, 4096, 1, 3477] - - [453, 9930.65] + - [548, 9930.65] - - [4096, 1024, 1, 3490] - - [460, 9878.45] + - [555, 9878.45] - - [4096, 1024, 1, 3585] - - [460, 9893.63] + - [555, 9893.63] - - [1024, 4096, 1, 3143] - - [453, 9901.19] + - [548, 9901.19] - - [1024, 33708, 1, 3876] - - [454, 10330.8] + - [549, 10330.8] - - [1024, 4096, 1, 3320] - - [461, 9913.18] + - [556, 9913.18] - - [1024, 4096, 1, 3423] - - [461, 9914.14] + - [556, 9914.14] - - [1024, 4096, 1, 3894] - - [453, 9944.47] + - [548, 9944.47] - - [4096, 1024, 1, 3410] - - [460, 9878.67] + - [555, 9878.67] - - [1024, 4096, 1, 3561] - - [453, 9926.68] + - [548, 9926.68] - - [4096, 1024, 1, 3492] - - [454, 9872.92] + - [549, 9872.92] - - [64, 85, 752, 85] - - [431, 5734.35] + - [526, 5734.35] - - [36548, 1024, 1, 3712] - - [463, 10367.6] + - [558, 10367.6] - - [4096, 2048, 1, 128] - - [464, 8743.93] + - [559, 8743.93] - - [1024, 1024, 1, 3712] - - [465, 9976.29] + - [560, 9976.29] - - [1024, 1024, 1, 128] - - [462, 5765.47] + - [557, 5765.47] - - [4096, 3072, 1, 128] - - [464, 8869.11] + - [559, 8869.11] - - [768, 3072, 1, 4096] - - [476, 10028.8] + - [571, 10028.8] - - [64, 256, 192, 256] - - [470, 8791.65] + - [565, 8791.65] - - [768, 2, 1, 16] - - [473, 5.05484] + - [568, 5.05484] - - [768, 768, 1, 64] - - [469, 3469.65] + - [564, 3469.65] - - [768, 768, 1, 4096] - - [477, 7475.1] + - [572, 7475.1] - - [768, 30522, 1, 1280] - - [480, 10297.0] + - [575, 10297.0] - - [64, 128, 384, 128] - - [470, 7660.93] + - [565, 7660.93] - - [768, 30522, 1, 320] - - [478, 10008.0] + - [573, 10008.0] - - [768, 768, 1, 32] - - [467, 2359.4] + - [562, 2359.4] - - [3072, 768, 1, 4096] - - [476, 10033.8] + - [571, 10033.8] - - [768, 30522, 1, 640] - - [479, 10206.8] + - [574, 10206.8] - - [64, 64, 768, 64] - - [468, 5494.82] + - [563, 5494.82] - - [768, 768, 1, 640] - - [477, 6721.74] + - [572, 6721.74] - - [768, 768, 1, 16] - - [466, 1203.82] + - [561, 1203.82] - - [768, 768, 1, 1280] - - [475, 7138.67] + - [570, 7138.67] - - [768, 2, 1, 32] - - [471, 11.9154] + - [566, 11.9154] - - [2048, 2048, 1, 512] - - [491, 9607.67] + - [586, 9607.67] - - [512, 32, 1, 200] - - [484, 422.368] + - [579, 422.368] - - [1024, 1, 1, 200] - - [487, 24.7154] + - [582, 24.7154] - - [1600, 1024, 1, 512] - - [482, 8116.01] + - [577, 8116.01] - - [560, 1024, 1, 200] - - [481, 4810.84] + - [576, 4810.84] - - [1024, 1024, 1, 512] - - [490, 8614.84] + - [585, 8614.84] - - [2048, 1, 1, 512] - - [485, 81.0086] + - [580, 81.0086] - - [512, 512, 1, 200] - - [483, 4398.49] + - [578, 4398.49] - - [100, 2048, 1, 512] - - [488, 4443.22] + - [583, 4443.22] - - [1024, 1024, 1, 200] - - [489, 6990.61] + - [584, 6990.61] - - [1024, 64, 1, 512] - - [486, 2853.37] + - [581, 2853.37] - - [1024, 256, 1, 18944] - - [510, 9196.51] + - [605, 9196.51] - - [256, 3328, 1, 8976] - - [500, 8299.36] + - [595, 8299.36] - - [1024, 256, 1, 4352] - - [508, 8813.84] + - [603, 8813.84] - - [256, 9728, 1, 8976] - - [503, 9638.58] + - [598, 9638.58] - - [1024, 256, 1, 3072] - - [510, 8640.73] + - [605, 8640.73] - - [768, 2048, 1, 256] - - [502, 8663.03] + - [597, 8663.03] - - [1024, 256, 1, 19968] - - [507, 9220.96] + - [602, 9220.96] - - [256, 12800, 1, 8976] - - [497, 9418.52] + - [592, 9418.52] - - [1024, 256, 1, 3328] - - [511, 8682.58] + - [606, 8682.58] - - [256, 10240, 1, 8976] - - [504, 10137.8] + - [599, 10137.8] - - [1024, 256, 1, 15104] - - [509, 9167.13] + - [604, 9167.13] - - [256, 10496, 1, 8976] - - [497, 9858.48] + - [592, 9858.48] - - [1024, 256, 1, 2816] - - [512, 8575.81] + - [607, 8575.81] - - [1024, 256, 1, 4608] - - [507, 8861.31] + - [602, 8861.31] - - [256, 11264, 1, 8976] - - [494, 9627.79] + - [589, 9627.79] - - [1024, 256, 1, 6400] - - [507, 8985.33] + - [602, 8985.33] - - [1024, 256, 1, 16128] - - [507, 9170.36] + - [602, 9170.36] - - [256, 44505, 1, 8976] - - [501, 10331.9] + - [596, 10331.9] - - [256, 6144, 1, 8976] - - [504, 10395.1] + - [599, 10395.1] - - [1024, 256, 1, 5120] - - [509, 8881.63] + - [604, 8881.63] - - [1024, 256, 1, 7936] - - [512, 9023.24] + - [607, 9023.24] - - [256, 3840, 1, 8976] - - [499, 9541.38] + - [594, 9541.38] - - [1024, 256, 1, 21248] - - [507, 9209.82] + - [602, 9209.82] - - [1024, 256, 1, 12032] - - [509, 9156.27] + - [604, 9156.27] - - [256, 8192, 1, 8976] - - [506, 10374.5] + - [601, 10374.5] - - [1024, 256, 1, 3584] - - [508, 8712.3] + - [603, 8712.3] - - [1024, 256, 1, 14336] - - [509, 9162.61] + - [604, 9162.61] - - [256, 7168, 1, 8976] - - [495, 9554.96] + - [590, 9554.96] - - [1024, 256, 1, 13568] - - [507, 9165.14] + - [602, 9165.14] - - [256, 4096, 1, 8976] - - [499, 10146.7] + - [594, 10146.7] - - [1024, 256, 1, 4096] - - [508, 8783.98] + - [603, 8783.98] - - [256, 2560, 1, 8976] - - [498, 8381.66] + - [593, 8381.66] - - [256, 20992, 1, 8976] - - [497, 9989.96] + - [592, 9989.96] - - [256, 4352, 1, 8976] - - [498, 9635.02] + - [593, 9635.02] - - [256, 33536, 1, 8976] - - [497, 10218.2] + - [592, 10218.2] - - [256, 3584, 1, 8976] - - [499, 8924.6] + - [594, 8924.6] - - [256, 26112, 1, 8976] - - [498, 10272.4] + - [593, 10272.4] - - [256, 14336, 1, 8976] - - [502, 10217.4] + - [597, 10217.4] - - [1024, 256, 1, 14848] - - [509, 9185.29] + - [604, 9185.29] - - [1024, 256, 1, 8448] - - [510, 9025.99] + - [605, 9025.99] - - [1024, 256, 1, 28672] - - [507, 9256.5] + - [602, 9256.5] - - [1024, 256, 1, 5632] - - [507, 8932.79] + - [602, 8932.79] - - [256, 22016, 1, 8976] - - [502, 10152.0] + - [597, 10152.0] - - [1024, 256, 1, 33536] - - [507, 9243.17] + - [602, 9243.17] - - [256, 5120, 1, 8976] - - [493, 9418.15] + - [588, 9418.15] - - [256, 11520, 1, 8976] - - [500, 9701.1] + - [595, 9701.1] - - [256, 19968, 1, 8976] - - [498, 10228.1] + - [593, 10228.1] - - [1024, 256, 1, 5376] - - [509, 8892.62] + - [604, 8892.62] - - [1024, 256, 1, 22016] - - [507, 9244.34] + - [602, 9244.34] - - [256, 8960, 1, 8976] - - [498, 9841.41] + - [593, 9841.41] - - [1024, 256, 1, 15872] - - [507, 9223.25] + - [602, 9223.25] - - [256, 17408, 1, 8976] - - [502, 9785.87] + - [597, 9785.87] - - [256, 5632, 1, 8976] - - [502, 9564.32] + - [597, 9564.32] - - [256, 32512, 1, 8976] - - [501, 10358.0] + - [596, 10358.0] - - [256, 11008, 1, 8976] - - [494, 9445.23] + - [589, 9445.23] - - [1024, 256, 1, 6144] - - [509, 8955.91] + - [604, 8955.91] - - [256, 4864, 1, 8976] - - [494, 8979.45] + - [589, 8979.45] - - [256, 15104, 1, 8976] - - [497, 10007.1] + - [592, 10007.1] - - [1024, 256, 1, 9984] - - [507, 9110.53] + - [602, 9110.53] - - [256, 1280, 1, 8976] - - [493, 5944.44] + - [588, 5944.44] - - [1024, 256, 1, 1024] - - [509, 7005.2] + - [604, 7005.2] - - [1024, 256, 1, 9728] - - [509, 9066.29] + - [604, 9066.29] - - [1024, 256, 1, 10496] - - [507, 9118.15] + - [602, 9118.15] - - [256, 11776, 1, 8976] - - [504, 9911.74] + - [599, 9911.74] - - [256, 12544, 1, 8976] - - [497, 9235.35] + - [592, 9235.35] - - [1024, 256, 1, 17152] - - [507, 9152.31] + - [602, 9152.31] - - [1024, 256, 1, 11520] - - [509, 9146.87] + - [604, 9146.87] - - [1024, 256, 1, 21504] - - [509, 9207.52] + - [604, 9207.52] - - [256, 17152, 1, 8976] - - [496, 9654.81] + - [591, 9654.81] - - [1024, 256, 1, 17408] - - [507, 9181.27] + - [602, 9181.27] - - [256, 15872, 1, 8976] - - [505, 10086.5] + - [600, 10086.5] - - [256, 18688, 1, 8976] - - [498, 9612.57] + - [593, 9612.57] - - [256, 5888, 1, 8976] - - [502, 9988.43] + - [597, 9988.43] - - [512, 2048, 1, 256] - - [492, 7678.46] + - [587, 7678.46] - - [1024, 256, 1, 7680] - - [510, 9033.06] + - [605, 9033.06] - - [1024, 256, 1, 1280] - - [512, 7767.33] + - [607, 7767.33] - - [256, 14848, 1, 8976] - - [498, 9852.76] + - [593, 9852.76] - - [256, 9984, 1, 8976] - - [504, 9908.97] + - [599, 9908.97] - - [256, 20480, 1, 8976] - - [502, 10337.2] + - [597, 10337.2] - - [1024, 256, 1, 8192] - - [509, 9044.42] + - [604, 9044.42] - - [1024, 256, 1, 19712] - - [508, 9184.28] + - [603, 9184.28] - - [256, 13568, 1, 8976] - - [498, 9927.92] + - [593, 9927.92] - - [256, 13312, 1, 8976] - - [497, 9758.01] + - [592, 9758.01] - - [256, 2816, 1, 8976] - - [497, 9191.53] + - [592, 9191.53] - - [1024, 256, 1, 2304] - - [508, 8445.01] + - [603, 8445.01] - - [256, 21248, 1, 8976] - - [498, 10127.6] + - [593, 10127.6] - - [256, 16128, 1, 8976] - - [506, 10238.5] + - [601, 10238.5] - - [256, 512, 36, 98] - - [529, 7994.95] + - [624, 7994.95] - - [64, 192, 36, 25088] - - [598, 8613.99] + - [693, 8613.99] - - [128, 128, 64, 25] - - [528, 2540.25] + - [623, 2540.25] - - [256, 256, 64, 56] - - [529, 6924.66] + - [624, 6924.66] - - [512, 486, 36, 800] - - [536, 8994.94] + - [631, 8994.94] - - [512, 512, 36, 1568] - - [547, 9872.48] + - [642, 9872.48] - - [64, 192, 64, 3200] - - [592, 9295.99] + - [687, 9295.99] - - [256, 384, 36, 4096] - - [592, 9334.71] + - [687, 9334.71] - - [128, 256, 64, 32] - - [531, 4280.0] + - [626, 4280.0] - - [64, 128, 64, 23104] - - [598, 10103.2] + - [693, 10103.2] - - [128, 256, 64, 9] - - [522, 1709.73] + - [617, 1709.73] - - [256, 512, 36, 784] - - [532, 9520.83] + - [627, 9520.83] - - [256, 324, 36, 32] - - [570, 4473.48] + - [665, 4473.48] - - [512, 512, 36, 33] - - [541, 5925.27] + - [636, 5925.27] - - [16, 32, 36, 5760] - - [545, 1448.9] + - [640, 1448.9] - - [192, 384, 64, 128] - - [592, 8618.53] + - [687, 8618.53] - - [512, 512, 64, 72] - - [548, 8260.22] + - [643, 8260.22] - - [128, 128, 64, 1600] - - [521, 9008.48] + - [616, 9008.48] - - [512, 512, 36, 128] - - [592, 8871.72] + - [687, 8871.72] - - [192, 384, 64, 2304] - - [521, 9657.26] + - [616, 9657.26] - - [384, 256, 64, 450] - - [557, 9539.03] + - [652, 9539.03] - - [3, 64, 36, 6272] - - [545, 509.884] + - [640, 509.884] - - [3, 64, 64, 2888] - - [574, 708.721] + - [669, 708.721] - - [384, 256, 64, 2304] - - [557, 10287.6] + - [652, 10287.6] - - [512, 512, 64, 144] - - [592, 9226.8] + - [687, 9226.8] - - [256, 256, 36, 6272] - - [532, 9607.38] + - [627, 9607.38] - - [80, 192, 64, 4608] - - [593, 7348.03] + - [688, 7348.03] - - [64, 64, 36, 3136] - - [580, 5959.15] + - [675, 5959.15] - - [256, 384, 64, 2304] - - [557, 10283.5] + - [652, 10283.5] - - [512, 512, 36, 66] - - [541, 7618.18] + - [636, 7618.18] - - [128, 256, 64, 800] - - [567, 9611.25] + - [662, 9611.25] - - [64, 128, 36, 30] - - [523, 1242.71] + - [618, 1242.71] - - [192, 256, 36, 512] - - [592, 8658.07] + - [687, 8658.07] - - [256, 512, 64, 200] - - [592, 9153.97] + - [687, 9153.97] - - [256, 512, 64, 25] - - [570, 5349.98] + - [665, 5349.98] - - [3, 64, 64, 46208] - - [573, 808.662] + - [668, 808.662] - - [128, 256, 36, 1568] - - [565, 8528.72] + - [660, 8528.72] - - [64, 128, 64, 11552] - - [598, 9997.1] + - [693, 9997.1] - - [128, 192, 64, 946] - - [592, 9198.48] + - [687, 9198.48] - - [64, 192, 64, 12800] - - [553, 9000.76] + - [648, 9000.76] - - [224, 224, 64, 128] - - [530, 6312.17] + - [625, 6312.17] - - [128, 256, 64, 288] - - [592, 8697.97] + - [687, 8697.97] - - [64, 64, 64, 826] - - [535, 6650.31] + - [630, 6650.31] - - [256, 384, 64, 1152] - - [567, 10106.9] + - [662, 10106.9] - - [3, 64, 64, 92416] - - [573, 812.131] + - [668, 812.131] - - [32, 32, 36, 43808] - - [514, 2813.19] + - [609, 2813.19] - - [160, 320, 64, 288] - - [524, 8090.96] + - [619, 8090.96] - - [1, 16, 36, 23040] - - [561, 42.7667] + - [656, 42.7667] - - [128, 256, 36, 128] - - [539, 6049.58] + - [634, 6049.58] - - [128, 128, 64, 3360] - - [592, 9200.06] + - [687, 9200.06] - - [128, 128, 64, 420] - - [592, 8131.6] + - [687, 8131.6] - - [64, 128, 64, 361] - - [529, 6938.08] + - [624, 6938.08] - - [512, 512, 36, 16] - - [585, 3797.76] + - [680, 3797.76] - - [384, 256, 36, 800] - - [526, 9151.75] + - [621, 9151.75] - - [192, 384, 36, 4096] - - [526, 8867.67] + - [621, 8867.67] - - [64, 64, 64, 1600] - - [578, 7931.84] + - [673, 7931.84] - - [256, 384, 64, 576] - - [558, 9745.9] + - [653, 9745.9] - - [512, 512, 64, 14] - - [541, 3638.28] + - [636, 3638.28] - - [512, 512, 36, 8] - - [516, 2279.61] + - [611, 2279.61] - - [512, 486, 64, 128] - - [532, 8337.93] + - [627, 8337.93] - - [1, 16, 64, 640] - - [566, 50.0512] + - [661, 50.0512] - - [64, 96, 64, 288] - - [591, 5708.07] + - [686, 5708.07] - - [96, 96, 36, 1568] - - [560, 6866.85] + - [655, 6866.85] - - [256, 256, 36, 128] - - [564, 7703.92] + - [659, 7703.92] - - [64, 128, 36, 53824] - - [552, 6331.41] + - [647, 6331.41] - - [256, 256, 36, 32] - - [548, 4648.96] + - [643, 4648.96] - - [192, 256, 64, 288] - - [592, 8987.89] + - [687, 8987.89] - - [256, 256, 36, 16] - - [562, 2912.81] + - [657, 2912.81] - - [128, 256, 36, 3200] - - [565, 8680.37] + - [660, 8680.37] - - [160, 320, 64, 512] - - [524, 8449.54] + - [619, 8449.54] - - [128, 160, 36, 512] - - [535, 7215.07] + - [630, 7215.07] - - [96, 96, 36, 2592] - - [530, 7104.89] + - [625, 7104.89] - - [64, 96, 64, 800] - - [560, 7268.42] + - [655, 7268.42] - - [147, 64, 36, 18816] - - [576, 7116.36] + - [671, 7116.36] - - [160, 320, 36, 512] - - [530, 7874.92] + - [625, 7874.92] - - [256, 512, 36, 4] - - [569, 1034.88] + - [664, 1034.88] - - [96, 128, 64, 946] - - [552, 7901.17] + - [647, 7901.17] - - [256, 324, 64, 1568] - - [557, 8589.63] + - [652, 8589.63] - - [128, 128, 64, 50] - - [548, 4070.66] + - [643, 4070.66] - - [35, 96, 36, 8960] - - [542, 4207.4] + - [637, 4207.4] - - [32, 64, 36, 43808] - - [583, 4390.91] + - [678, 4390.91] - - [160, 224, 36, 128] - - [530, 5447.02] + - [625, 5447.02] - - [64, 64, 64, 81] - - [555, 2391.28] + - [650, 2391.28] - - [256, 256, 36, 3200] - - [521, 9559.65] + - [616, 9559.65] - - [256, 256, 36, 210] - - [532, 8414.71] + - [627, 8414.71] - - [192, 384, 64, 576] - - [592, 9468.85] + - [687, 9468.85] - - [512, 512, 64, 800] - - [567, 10096.5] + - [662, 10096.5] - - [512, 24, 36, 800] - - [518, 4761.87] + - [613, 4761.87] - - [64, 64, 64, 13216] - - [579, 8491.51] + - [674, 8491.51] - - [192, 224, 64, 1152] - - [535, 8769.16] + - [630, 8769.16] - - [256, 256, 64, 1152] - - [557, 9988.19] + - [652, 9988.19] - - [512, 486, 64, 512] - - [567, 9254.77] + - [662, 9254.77] - - [128, 128, 36, 784] - - [530, 7468.16] + - [625, 7468.16] - - [256, 512, 64, 1600] - - [554, 10232.6] + - [649, 10232.6] - - [512, 512, 64, 9] - - [548, 2599.88] + - [643, 2599.88] - - [96, 128, 64, 288] - - [560, 6599.53] + - [655, 6599.53] - - [64, 96, 36, 512] - - [560, 5073.85] + - [655, 5073.85] - - [256, 512, 36, 1568] - - [592, 9637.91] + - [687, 9637.91] - - [128, 128, 64, 400] - - [592, 8192.1] + - [687, 8192.1] - - [128, 128, 64, 800] - - [592, 8716.44] + - [687, 8716.44] - - [96, 128, 36, 512] - - [580, 6757.03] + - [675, 6757.03] - - [16, 32, 36, 360] - - [543, 754.136] + - [638, 754.136] - - [128, 256, 64, 3200] - - [557, 10222.6] + - [652, 10222.6] - - [96, 128, 64, 800] - - [560, 7968.0] + - [655, 7968.0] - - [256, 512, 64, 4] - - [522, 1098.09] + - [617, 1098.09] - - [256, 256, 64, 450] - - [567, 9347.55] + - [662, 9347.55] - - [64, 64, 64, 3200] - - [578, 8518.18] + - [673, 8518.18] - - [192, 224, 64, 128] - - [538, 7035.27] + - [633, 7035.27] - - [128, 128, 64, 288] - - [592, 7751.38] + - [687, 7751.38] - - [256, 256, 64, 72] - - [548, 7489.93] + - [643, 7489.93] - - [96, 208, 36, 512] - - [560, 6939.21] + - [655, 6939.21] - - [128, 256, 36, 3136] - - [535, 8669.43] + - [630, 8669.43] - - [64, 64, 36, 3520] - - [530, 6007.57] + - [625, 6007.57] - - [64, 128, 36, 1568] - - [593, 6897.8] + - [688, 6897.8] - - [160, 320, 64, 242] - - [519, 7873.27] + - [614, 7873.27] - - [192, 192, 36, 512] - - [530, 7707.42] + - [625, 7707.42] - - [512, 512, 36, 512] - - [592, 9582.52] + - [687, 9582.52] - - [1, 16, 64, 10240] - - [544, 71.4511] + - [639, 71.4511] - - [128, 128, 36, 512] - - [530, 7149.48] + - [625, 7149.48] - - [512, 512, 36, 256] - - [521, 9384.5] + - [616, 9384.5] - - [512, 512, 36, 1024] - - [515, 9777.99] + - [610, 9777.99] - - [96, 208, 64, 1152] - - [593, 7851.0] + - [688, 7851.0] - - [128, 192, 64, 3200] - - [521, 9490.92] + - [616, 9490.92] - - [256, 256, 36, 4096] - - [526, 9585.56] + - [621, 9585.56] - - [160, 160, 64, 288] - - [560, 7299.9] + - [655, 7299.9] - - [256, 256, 64, 896] - - [557, 9850.43] + - [652, 9850.43] - - [128, 256, 64, 242] - - [592, 8391.48] + - [687, 8391.48] - - [128, 128, 36, 440] - - [535, 6274.82] + - [630, 6274.82] - - [96, 128, 36, 1568] - - [580, 7875.13] + - [675, 7875.13] - - [192, 384, 36, 1024] - - [526, 8715.82] + - [621, 8715.82] - - [64, 96, 36, 10368] - - [597, 7478.69] + - [692, 7478.69] - - [128, 256, 64, 100] - - [541, 7085.07] + - [636, 7085.07] - - [112, 224, 36, 2048] - - [534, 7556.02] + - [629, 7556.02] - - [384, 256, 64, 1152] - - [557, 10102.4] + - [652, 10102.4] - - [192, 384, 36, 128] - - [592, 7543.14] + - [687, 7543.14] - - [128, 128, 36, 7040] - - [565, 7600.7] + - [660, 7600.7] - - [128, 256, 64, 1568] - - [557, 10006.0] + - [652, 10006.0] - - [128, 128, 36, 1568] - - [549, 7848.4] + - [644, 7848.4] - - [128, 256, 64, 72] - - [572, 6553.7] + - [667, 6553.7] - - [256, 256, 36, 12544] - - [586, 9365.14] + - [681, 9365.14] - - [256, 256, 36, 105] - - [548, 7286.16] + - [643, 7286.16] - - [128, 256, 36, 392] - - [535, 7625.79] + - [630, 7625.79] - - [64, 64, 64, 5408] - - [578, 8882.77] + - [673, 8882.77] - - [3, 64, 36, 25088] - - [545, 529.042] + - [640, 529.042] - - [384, 256, 36, 1024] - - [592, 9182.85] + - [687, 9182.85] - - [35, 96, 36, 13440] - - [599, 4110.39] + - [694, 4110.39] - - [128, 256, 64, 1152] - - [557, 9804.97] + - [652, 9804.97] - - [256, 324, 64, 32] - - [570, 5043.73] + - [665, 5043.73] - - [160, 224, 64, 128] - - [584, 6046.25] + - [679, 6046.25] - - [192, 224, 36, 2592] - - [582, 8878.78] + - [677, 8878.78] - - [96, 96, 64, 1152] - - [560, 8035.55] + - [655, 8035.55] - - [32, 64, 36, 90] - - [517, 964.565] + - [612, 964.565] - - [64, 128, 64, 2888] - - [532, 9047.33] + - [627, 9047.33] - - [256, 384, 36, 800] - - [592, 9154.12] + - [687, 9154.12] - - [512, 512, 64, 4] - - [589, 1233.72] + - [684, 1233.72] - - [192, 320, 36, 128] - - [529, 7388.29] + - [624, 7388.29] - - [64, 128, 36, 480] - - [593, 5653.37] + - [688, 5653.37] - - [192, 384, 64, 242] - - [592, 9080.09] + - [687, 9080.09] - - [256, 486, 64, 32] - - [585, 5909.28] + - [680, 5909.28] - - [147, 64, 64, 9702] - - [594, 7319.79] + - [689, 7319.79] - - [512, 512, 64, 64] - - [528, 8179.12] + - [623, 8179.12] - - [64, 192, 64, 3698] - - [521, 9287.99] + - [616, 9287.99] - - [73, 192, 64, 10439] - - [552, 6668.12] + - [647, 6668.12] - - [1, 16, 36, 1440] - - [568, 33.5452] + - [663, 33.5452] - - [128, 256, 36, 512] - - [535, 7989.25] + - [630, 7989.25] - - [512, 512, 64, 576] - - [567, 9951.99] + - [662, 9951.99] - - [64, 64, 36, 12544] - - [583, 5872.87] + - [678, 5872.87] - - [128, 128, 36, 880] - - [580, 7597.36] + - [675, 7597.36] - - [192, 224, 36, 128] - - [538, 6451.3] + - [633, 6451.3] - - [64, 64, 64, 800] - - [578, 6916.83] + - [673, 6916.83] - - [64, 128, 36, 12544] - - [556, 6395.98] + - [651, 6395.98] - - [64, 64, 36, 1568] - - [530, 5536.76] + - [625, 5536.76] - - [160, 160, 36, 512] - - [530, 7345.36] + - [625, 7345.36] - - [512, 24, 64, 512] - - [520, 5242.98] + - [615, 5242.98] - - [3, 64, 36, 3136] - - [545, 475.452] + - [640, 475.452] - - [256, 256, 64, 9] - - [570, 2106.61] + - [665, 2106.61] - - [3, 64, 64, 11552] - - [573, 785.227] + - [668, 785.227] - - [128, 256, 36, 12544] - - [588, 8792.23] + - [683, 8792.23] - - [128, 128, 36, 3136] - - [549, 8098.56] + - [644, 8098.56] - - [256, 512, 36, 3136] - - [532, 9694.49] + - [627, 9694.49] - - [64, 64, 36, 196] - - [546, 2757.86] + - [641, 2757.86] - - [144, 288, 36, 512] - - [580, 7077.99] + - [675, 7077.99] - - [256, 24, 64, 32] - - [559, 1483.93] + - [654, 1483.93] - - [384, 384, 36, 800] - - [521, 9246.6] + - [616, 9246.6] - - [512, 512, 64, 1600] - - [567, 10277.4] + - [662, 10277.4] - - [112, 224, 36, 512] - - [535, 6744.88] + - [630, 6744.88] - - [128, 128, 36, 49] - - [541, 2716.39] + - [636, 2716.39] - - [512, 512, 36, 4] - - [569, 1156.62] + - [664, 1156.62] - - [35, 96, 64, 4235] - - [530, 4631.38] + - [625, 4631.38] - - [192, 384, 64, 450] - - [521, 9372.3] + - [616, 9372.3] - - [256, 256, 36, 1024] - - [592, 9346.74] + - [687, 9346.74] - - [112, 224, 64, 1152] - - [535, 7524.05] + - [630, 7524.05] - - [256, 512, 64, 400] - - [554, 9598.05] + - [649, 9598.05] - - [149, 32, 36, 19072] - - [599, 5811.9] + - [694, 5811.9] - - [128, 256, 36, 6272] - - [535, 8754.78] + - [630, 8754.78] - - [128, 192, 36, 1568] - - [560, 8195.2] + - [655, 8195.2] - - [256, 256, 36, 512] - - [592, 9074.32] + - [687, 9074.32] - - [256, 256, 64, 112] - - [592, 8305.65] + - [687, 8305.65] - - [512, 512, 64, 18] - - [585, 4324.12] + - [680, 4324.12] - - [256, 256, 64, 18] - - [548, 3547.91] + - [643, 3547.91] - - [256, 256, 64, 1568] - - [557, 10141.8] + - [652, 10141.8] - - [64, 96, 36, 1568] - - [578, 6805.76] + - [673, 6805.76] - - [384, 256, 36, 4096] - - [592, 9311.2] + - [687, 9311.2] - - [256, 512, 64, 800] - - [567, 9998.45] + - [662, 9998.45] - - [256, 384, 36, 2048] - - [592, 9285.44] + - [687, 9285.44] - - [3, 64, 36, 200704] - - [574, 547.475] + - [669, 547.475] - - [384, 384, 64, 2304] - - [515, 9901.78] + - [610, 9901.78] - - [160, 320, 64, 128] - - [551, 7113.91] + - [646, 7113.91] - - [512, 512, 36, 528] - - [521, 9567.75] + - [616, 9567.75] - - [160, 320, 36, 128] - - [552, 6411.23] + - [647, 6411.23] - - [96, 96, 64, 800] - - [560, 7690.11] + - [655, 7690.11] - - [256, 512, 36, 49] - - [548, 6721.35] + - [643, 6721.35] - - [384, 384, 64, 450] - - [521, 9523.63] + - [616, 9523.63] - - [3, 64, 64, 23104] - - [573, 801.721] + - [668, 801.721] - - [256, 256, 64, 3200] - - [557, 10300.5] + - [652, 10300.5] - - [128, 192, 36, 512] - - [535, 7499.85] + - [630, 7499.85] - - [192, 192, 64, 288] - - [592, 8774.34] + - [687, 8774.34] - - [96, 208, 64, 242] - - [552, 5902.09] + - [647, 5902.09] - - [256, 16, 36, 3200] - - [581, 3807.87] + - [676, 3807.87] - - [512, 512, 64, 8] - - [559, 2379.85] + - [654, 2379.85] - - [64, 128, 64, 5776] - - [532, 9332.84] + - [627, 9332.84] - - [512, 512, 64, 288] - - [521, 9522.09] + - [616, 9522.09] - - [256, 16, 36, 32] - - [577, 766.105] + - [672, 766.105] - - [128, 192, 64, 288] - - [592, 8527.68] + - [687, 8527.68] - - [32, 64, 64, 640] - - [560, 4660.44] + - [655, 4660.44] - - [64, 64, 36, 392] - - [560, 3686.5] + - [655, 3686.5] - - [384, 384, 36, 1024] - - [526, 9282.58] + - [621, 9282.58] - - [64, 64, 36, 11552] - - [590, 5904.88] + - [685, 5904.88] - - [96, 128, 36, 6272] - - [580, 8351.09] + - [675, 8351.09] - - [128, 256, 36, 16] - - [562, 2144.91] + - [657, 2144.91] - - [256, 256, 64, 288] - - [592, 9140.23] + - [687, 9140.23] - - [64, 64, 64, 1652] - - [578, 7766.63] + - [673, 7766.63] - - [256, 384, 36, 1024] - - [526, 9203.37] + - [621, 9203.37] - - [96, 128, 64, 3200] - - [595, 8866.3] + - [690, 8866.3] - - [256, 324, 36, 3200] - - [534, 8194.35] + - [629, 8194.35] - - [128, 192, 64, 800] - - [592, 9198.13] + - [687, 9198.13] - - [64, 128, 64, 10] - - [533, 851.217] + - [628, 851.217] - - [96, 208, 64, 288] - - [560, 6667.68] + - [655, 6667.68] - - [64, 96, 36, 2592] - - [542, 7216.98] + - [637, 7216.98] - - [64, 128, 64, 160] - - [571, 5191.07] + - [666, 5191.07] - - [192, 384, 64, 512] - - [521, 9446.14] + - [616, 9446.14] - - [64, 64, 36, 6272] - - [530, 6212.11] + - [625, 6212.11] - - [512, 24, 36, 288] - - [527, 3922.57] + - [622, 3922.57] - - [128, 128, 64, 1568] - - [521, 9037.96] + - [616, 9037.96] - - [112, 224, 64, 242] - - [591, 6399.36] + - [686, 6399.36] - - [128, 256, 64, 1600] - - [557, 10010.4] + - [652, 10010.4] - - [32, 32, 64, 20000] - - [525, 4378.51] + - [620, 4378.51] - - [160, 192, 64, 288] - - [552, 7803.73] + - [647, 7803.73] - - [512, 24, 64, 128] - - [513, 3733.9] + - [608, 3733.9] - - [512, 512, 36, 32] - - [548, 5935.44] + - [643, 5935.44] - - [3, 64, 36, 100352] - - [545, 542.883] + - [640, 542.883] - - [3, 64, 64, 1444] - - [574, 674.259] + - [669, 674.259] - - [512, 512, 36, 3136] - - [515, 9921.2] + - [610, 9921.2] - - [128, 256, 64, 6400] - - [575, 10349.4] + - [670, 10349.4] - - [256, 256, 36, 2048] - - [592, 9519.09] + - [687, 9519.09] - - [128, 160, 64, 288] - - [535, 7549.85] + - [630, 7549.85] - - [256, 256, 64, 6400] - - [557, 10392.7] + - [652, 10392.7] - - [32, 64, 64, 20000] - - [583, 6493.96] + - [678, 6493.96] - - [256, 256, 36, 1680] - - [532, 9513.39] + - [627, 9513.39] - - [128, 128, 64, 210] - - [592, 7094.2] + - [687, 7094.2] - - [192, 384, 36, 2048] - - [521, 8818.75] + - [616, 8818.75] - - [256, 256, 64, 144] - - [592, 8608.71] + - [687, 8608.71] - - [384, 384, 36, 4096] - - [526, 9357.04] + - [621, 9357.04] - - [160, 320, 64, 1152] - - [552, 8749.58] + - [647, 8749.58] - - [384, 256, 36, 2048] - - [592, 9279.73] + - [687, 9279.73] - - [256, 512, 36, 392] - - [592, 9252.24] + - [687, 9252.24] - - [256, 512, 64, 50] - - [548, 7511.39] + - [643, 7511.39] - - [73, 192, 36, 23360] - - [596, 5803.03] + - [691, 5803.03] - - [3, 64, 36, 50176] - - [545, 542.137] + - [640, 542.137] - - [384, 384, 36, 2048] - - [521, 9325.9] + - [616, 9325.9] - - [256, 384, 64, 450] - - [567, 9528.76] + - [662, 9528.76] - - [192, 320, 64, 128] - - [526, 8399.91] + - [621, 8399.91] - - [128, 256, 36, 32] - - [541, 3276.9] + - [636, 3276.9] - - [160, 192, 36, 512] - - [580, 7752.44] + - [675, 7752.44] - - [512, 512, 64, 256] - - [532, 9473.74] + - [627, 9473.74] - - [256, 512, 64, 32] - - [570, 6391.42] + - [665, 6391.42] - - [384, 384, 64, 576] - - [521, 9614.89] + - [616, 9614.89] - - [64, 64, 64, 648] - - [578, 6282.25] + - [673, 6282.25] - - [512, 486, 36, 288] - - [592, 8625.03] + - [687, 8625.03] - - [32, 64, 36, 1440] - - [530, 3961.6] + - [625, 3961.6] - - [144, 288, 64, 242] - - [552, 6347.12] + - [647, 6347.12] - - [384, 256, 64, 576] - - [557, 9775.34] + - [652, 9775.34] - - [512, 512, 36, 64] - - [528, 7791.38] + - [623, 7791.38] - - [448, 384, 64, 128] - - [521, 9132.33] + - [616, 9132.33] - - [64, 128, 64, 722] - - [571, 8047.21] + - [666, 8047.21] - - [144, 288, 64, 288] - - [580, 6859.5] + - [675, 6859.5] - - [512, 512, 64, 224] - - [592, 9427.39] + - [687, 9427.39] - - [112, 224, 64, 288] - - [591, 6737.02] + - [686, 6737.02] - - [384, 384, 64, 1152] - - [515, 9820.56] + - [610, 9820.56] - - [448, 384, 36, 128] - - [592, 8761.41] + - [687, 8761.41] - - [64, 64, 64, 100] - - [538, 2708.2] + - [633, 2708.2] - - [256, 486, 36, 128] - - [564, 7640.14] + - [659, 7640.14] - - [64, 96, 64, 4608] - - [593, 8351.59] + - [688, 8351.59] - - [16, 32, 64, 160] - - [517, 736.46] + - [612, 736.46] - - [64, 192, 36, 6272] - - [593, 8041.29] + - [688, 8041.29] - - [64, 64, 64, 200] - - [546, 3924.41] + - [641, 3924.41] - - [256, 256, 36, 800] - - [592, 9299.65] + - [687, 9299.65] - - [64, 128, 36, 6272] - - [590, 6816.46] + - [685, 6816.46] - - [32, 64, 64, 40] - - [537, 885.722] + - [632, 885.722] - - [256, 16, 64, 32] - - [587, 1205.36] + - [682, 1205.36] - - [192, 384, 36, 800] - - [526, 8673.98] + - [621, 8673.98] - - [128, 128, 36, 3200] - - [560, 8538.99] + - [655, 8538.99] - - [256, 256, 36, 256] - - [532, 8454.46] + - [627, 8454.46] - - [192, 384, 64, 1152] - - [521, 9589.11] + - [616, 9589.11] - - [128, 256, 64, 200] - - [531, 8141.22] + - [626, 8141.22] - - [64, 96, 64, 1152] - - [560, 7620.98] + - [655, 7620.98] - - [128, 128, 36, 392] - - [535, 6175.61] + - [630, 6175.61] - - [80, 192, 36, 10368] - - [583, 6497.26] + - [678, 6497.26] - - [224, 224, 36, 128] - - [593, 5826.99] + - [688, 5826.99] - - [512, 512, 64, 28] - - [548, 5728.91] + - [643, 5728.91] - - [256, 16, 64, 1568] - - [563, 4637.3] + - [658, 4637.3] - - [144, 288, 64, 1152] - - [580, 7784.34] + - [675, 7784.34] - - [256, 256, 64, 576] - - [557, 9596.22] + - [652, 9596.22] - - [64, 128, 36, 784] - - [593, 6059.09] + - [688, 6059.09] - - [256, 24, 36, 128] - - [527, 2239.94] + - [622, 2239.94] - - [256, 256, 64, 2304] - - [557, 10225.8] + - [652, 10225.8] - - [192, 384, 36, 512] - - [592, 8549.13] + - [687, 8549.13] - - [16, 32, 64, 2560] - - [545, 2153.23] + - [640, 2153.23] - - [256, 512, 36, 32] - - [570, 5702.33] + - [665, 5702.33] - - [512, 512, 64, 128] - - [592, 9084.21] + - [687, 9084.21] - - [128, 128, 64, 200] - - [529, 6972.01] + - [624, 6972.01] - - [512, 512, 64, 32] - - [541, 6248.6] + - [636, 6248.6] - - [128, 256, 36, 196] - - [541, 6628.86] + - [636, 6628.86] - - [8, 384, 64, 6600] - - [573, 2733.99] + - [668, 2733.99] - - [149, 32, 64, 8195] - - [535, 6051.01] + - [630, 6051.01] - - [35, 96, 64, 6160] - - [580, 4689.45] + - [675, 4689.45] - - [64, 64, 36, 1760] - - [530, 5622.34] + - [625, 5622.34] + - - [196, 528, 32, 32] + - [708, 4088.51] + - - [5329, 64, 32, 80] + - [701, 8331.24] + - - [64, 2880, 1, 320] + - [752, 4362.7] + - - [49, 832, 32, 256] + - [715, 5618.73] + - - [3136, 64, 64, 64] + - [701, 8457.75] + - - [196, 512, 32, 24] + - [702, 3621.83] + - - [289, 1120, 1, 160] + - [698, 3302.96] + - - [1225, 192, 32, 32] + - [706, 6194.67] + - - [64, 2048, 32, 384] + - [729, 9541.64] + - - [1001, 1536, 1, 32] + - [700, 3575.77] + - - [289, 1792, 1, 320] + - [723, 5140.43] + - - [3136, 256, 64, 64] + - [724, 9310.22] + - - [1001, 1024, 1, 32] + - [695, 2733.5] + - - [196, 480, 32, 64] + - [756, 5070.52] + - - [64, 1728, 1, 320] + - [753, 3205.67] + - - [49, 832, 32, 160] + - [757, 4988.92] + - - [49, 2048, 64, 512] + - [727, 7370.41] + - - [49, 832, 32, 384] + - [715, 5902.05] + - - [289, 896, 1, 192] + - [741, 3452.69] + - - [289, 1024, 32, 384] + - [760, 8902.52] + - - [784, 192, 32, 96] + - [771, 7853.73] + - - [50176, 256, 1, 128] + - [734, 9041.93] + - - [289, 1024, 32, 256] + - [769, 8660.82] + - - [289, 1024, 32, 192] + - [758, 8433.45] + - - [12544, 512, 1, 256] + - [718, 9187.44] + - - [1225, 1728, 1, 192] + - [722, 7720.95] + - - [196, 480, 32, 96] + - [767, 5662.6] + - - [196, 512, 32, 144] + - [761, 6531.48] + - - [784, 400, 1, 32] + - [696, 1280.1] + - - [289, 768, 32, 128] + - [762, 7913.71] + - - [5329, 576, 1, 96] + - [705, 7563.56] + - - [49, 1200, 1, 128] + - [749, 1011.71] + - - [64, 1536, 32, 256] + - [763, 9159.64] + - - [289, 2592, 1, 384] + - [731, 6002.81] + - - [196, 528, 32, 128] + - [766, 5987.2] + - - [64, 2048, 32, 448] + - [729, 9669.97] + - - [196, 1024, 64, 256] + - [768, 7819.04] + - - [5329, 448, 1, 64] + - [701, 6201.12] + - - [784, 256, 32, 64] + - [703, 7623.28] + - - [784, 192, 32, 32] + - [708, 5874.36] + - - [21609, 288, 1, 32] + - [721, 5296.6] + - - [784, 256, 32, 32] + - [699, 6235.56] + - - [5041, 720, 1, 192] + - [717, 8141.08] + - - [289, 2016, 1, 256] + - [714, 5404.15] + - - [196, 512, 32, 128] + - [759, 6366.92] + - - [289, 768, 32, 160] + - [761, 8253.98] + - - [64, 1536, 32, 384] + - [732, 9508.6] + - - [64, 1280, 32, 320] + - [732, 9070.83] + - - [289, 896, 1, 128] + - [742, 2917.78] + - - [289, 3456, 1, 384] + - [722, 7275.01] + - - [196, 800, 1, 64] + - [744, 1393.88] + - - [64, 1280, 32, 384] + - [728, 9225.11] + - - [64, 1344, 1, 512] + - [747, 3041.55] + - - [1001, 4096, 1, 512] + - [728, 9391.87] + - - [1225, 192, 32, 64] + - [701, 7729.39] + - - [64, 1152, 1, 384] + - [751, 2440.75] + - - [729, 1600, 1, 192] + - [713, 6827.81] + - - [289, 1344, 1, 192] + - [711, 4439.14] + - - [784, 192, 32, 16] + - [738, 3663.14] + - - [3136, 1024, 1, 2048] + - [720, 9071.87] + - - [64, 1152, 1, 448] + - [748, 2564.55] + - - [49, 832, 32, 128] + - [711, 4733.26] + - - [784, 256, 32, 128] + - [724, 8471.7] + - - [49, 800, 1, 128] + - [746, 633.635] + - - [196, 512, 32, 32] + - [708, 4354.36] + - - [1225, 384, 32, 96] + - [725, 8751.73] + - - [5041, 576, 1, 96] + - [707, 7067.73] + - - [49, 832, 32, 48] + - [740, 3316.82] + - - [3136, 64, 64, 256] + - [762, 9722.0] + - - [5329, 160, 32, 64] + - [764, 8159.94] + - - [1225, 288, 32, 48] + - [754, 6673.75] + - - [4096, 9216, 1, 512] + - [736, 10117.0] + - - [196, 480, 32, 192] + - [765, 6388.56] + - - [64, 1152, 1, 256] + - [752, 1982.7] + - - [3136, 1024, 1, 512] + - [720, 8745.67] + - - [49, 832, 32, 32] + - [739, 2717.97] + - - [784, 192, 32, 64] + - [703, 7216.42] + - - [289, 1024, 32, 128] + - [726, 7970.6] + - - [289, 768, 32, 192] + - [770, 8327.37] + - - [289, 1120, 1, 192] + - [710, 3717.0] + - - [196, 512, 32, 112] + - [716, 6252.91] + - - [1001, 2048, 1, 32] + - [704, 4000.19] + - - [1225, 288, 32, 64] + - [764, 7208.14] + - - [196, 600, 1, 64] + - [743, 1094.05] + - - [1225, 384, 32, 192] + - [725, 9332.76] + - - [50176, 256, 1, 512] + - [735, 9833.64] + - - [196, 512, 32, 160] + - [762, 6614.44] + - - [4096, 4096, 1, 512] + - [733, 10032.3] + - - [49, 832, 32, 192] + - [711, 5244.63] + - - [1225, 256, 32, 64] + - [701, 7972.45] + - - [64, 2048, 32, 320] + - [729, 9404.37] + - - [196, 480, 32, 16] + - [755, 2724.59] + - - [1225, 256, 32, 48] + - [703, 7100.48] + - - [64, 1280, 32, 448] + - [728, 9344.51] + - - [1225, 1200, 1, 64] + - [697, 5157.99] + - - [1225, 384, 32, 64] + - [701, 8220.06] + - - [12544, 512, 1, 1024] + - [720, 9672.82] + - - [64, 1280, 32, 192] + - [716, 8525.11] + - - [196, 512, 32, 64] + - [701, 5489.44] + - - [289, 1792, 1, 256] + - [719, 4831.71] + - - [196, 528, 32, 256] + - [737, 6453.92] + - - [49, 512, 64, 2048] + - [772, 7549.08] + - - [64, 2048, 32, 192] + - [724, 8955.91] + - - [784, 512, 64, 128] + - [724, 9160.83] + - - [784, 128, 64, 512] + - [731, 9280.79] + - - [196, 528, 32, 160] + - [765, 6161.25] + - - [1225, 192, 32, 48] + - [701, 7237.02] + - - [64, 1728, 1, 192] + - [751, 2480.67] + - - [1001, 2048, 1, 64] + - [777, 5714.52] + - - [5329, 64, 128, 80] + - [784, 8835.39] + - - [64, 1280, 128, 448] + - [782, 10020.6] + - - [289, 768, 128, 128] + - [785, 8542.81] + - - [1225, 192, 128, 64] + - [774, 8444.87] + - - [1225, 288, 128, 48] + - [787, 7244.76] + - - [289, 768, 128, 192] + - [789, 8794.59] + - - [289, 768, 128, 160] + - [786, 8705.43] + - - [64, 2048, 128, 192] + - [780, 9780.36] + - - [64, 1280, 128, 384] + - [783, 9951.0] + - - [1225, 256, 128, 48] + - [775, 8273.71] + - - [1225, 192, 128, 48] + - [775, 8140.42] + - - [1225, 288, 128, 64] + - [787, 7886.31] + - - [64, 1280, 128, 320] + - [779, 9894.66] + - - [1225, 256, 128, 64] + - [780, 8572.61] + - - [1001, 2048, 1, 128] + - [781, 7289.16] + - - [1225, 192, 128, 32] + - [776, 7104.67] + - - [64, 1280, 128, 192] + - [788, 9642.18] + - - [1001, 1536, 1, 64] + - [778, 5146.66] - null diff --git a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml index 096950937..3a6e9917c 100644 --- a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -98605,15 +98605,15 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -98621,7 +98621,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -98629,37 +98629,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -98667,30 +98668,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98698,6 +98708,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98707,6 +98718,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -98716,53 +98728,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 621 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -98770,7 +98793,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -98778,37 +98801,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -98816,30 +98840,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98847,6 +98880,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98856,6 +98890,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -98865,47 +98900,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 622 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -98917,9 +98963,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -98927,75 +98973,82 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 4 LSPB: 16 - LVCA: 32 + LVCA: 64 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99005,6 +99058,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99014,95 +99068,111 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 623 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id004 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2560 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -99111,36 +99181,46 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99150,6 +99230,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99159,95 +99240,111 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 624 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -99255,37 +99352,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99295,6 +99402,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99304,53 +99412,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 625 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id003 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99359,7 +99478,7 @@ ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99367,63 +99486,73 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -99431,6 +99560,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99440,6 +99570,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99449,28 +99580,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 626 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -99481,30 +99622,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id007 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99512,70 +99654,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 32 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99585,6 +99742,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99594,28 +99752,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 627 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -99626,98 +99794,107 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99725,6 +99902,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99734,6 +99912,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99743,28 +99922,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 628 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -99774,64 +99963,72 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -99840,36 +100037,44 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99879,6 +100084,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99888,62 +100094,75 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 629 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99951,70 +100170,83 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100024,6 +100256,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100033,28 +100266,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 630 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -100065,21 +100308,24 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id007 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100087,7 +100333,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -100095,75 +100341,86 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100173,6 +100430,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100182,133 +100440,159 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 631 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 2560 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100318,6 +100602,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100327,53 +100612,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 632 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100381,7 +100677,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -100389,33 +100685,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 4352 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100424,29 +100721,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 4 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 4 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -100454,6 +100760,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100463,6 +100770,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100472,95 +100780,111 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 633 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id007 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 8 LSPA: 8 - LSPB: 4 + LSPB: 128 LVCA: 32 - LVCB: 64 + LVCB: 2 LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 4352 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100569,36 +100893,46 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 4 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 4 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100608,6 +100942,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100617,53 +100952,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 634 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100671,45 +101017,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 48 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 4 - LSPB: 12 - LVCA: 48 - LVCB: 16 - LVPA: 4 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100717,30 +101064,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100748,6 +101104,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100757,6 +101114,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100766,47 +101124,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 635 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 12, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -100818,41 +101187,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 12 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 12 - LVCA: 12 - LVCB: 16 - LVPA: 16 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -100866,37 +101232,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100906,6 +101282,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100915,53 +101292,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 636 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 - SubGroup0: 12 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 12 + SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100969,39 +101357,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -101014,38 +101403,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101055,6 +101454,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101064,53 +101464,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 637 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101118,7 +101529,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -101126,37 +101537,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 48 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 12 - LVCA: 24 - LVCB: 16 - LVPA: 4 - LVPB: 6 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101164,30 +101576,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101195,6 +101616,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101204,6 +101626,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101213,53 +101636,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 638 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 - SubGroup0: 8 - SubGroup1: 6 - SubGroupA: 8 - SubGroupB: 6 - SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id010 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101267,45 +101701,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 24 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 6 - LVCA: 24 - LVCB: 32 - LVPA: 8 - LVPB: 6 - LdcEqualsLdd: false + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101313,30 +101748,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 24 - MacroTile1: 24 - MacroTileA: 24 - MacroTileB: 24 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 3 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101344,6 +101788,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101353,6 +101798,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101362,53 +101808,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 639 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 - SubGroup0: 8 - SubGroup1: 6 - SubGroupA: 8 - SubGroupB: 6 - SuppresssNoLoadLoop: false - ThreadTile: [3, 4] - ThreadTile0: 3 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 3 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id010 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101416,7 +101873,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -101424,37 +101881,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 48 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 12 - LVCA: 24 - LVCB: 16 - LVPA: 4 - LVPB: 6 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101462,37 +101920,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101502,6 +101970,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101511,49 +101980,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 640 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 - SubGroup0: 8 - SubGroup1: 6 - SubGroupA: 8 - SubGroupB: 6 - SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id010 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -101563,43 +102043,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101611,30 +102092,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101642,6 +102132,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101651,6 +102142,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101660,95 +102152,107 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 641 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101759,31 +102263,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101791,6 +102304,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101800,6 +102314,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101809,49 +102324,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 642 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -101861,41 +102387,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 8 - LVCB: 2 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -101909,30 +102436,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101940,6 +102474,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101949,6 +102484,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101958,49 +102494,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 643 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102010,41 +102559,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 LSPA: 4 - LSPB: 32 - LVCA: 16 - LVCB: 2 - LVPA: 1 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102058,30 +102608,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102089,6 +102646,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102098,6 +102656,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102107,49 +102666,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 644 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102159,41 +102731,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102207,30 +102780,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102238,6 +102818,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102247,6 +102828,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102256,49 +102838,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 645 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102308,41 +102903,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 16 + LSPA: 8 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102356,30 +102952,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102387,6 +102990,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102396,6 +103000,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102405,49 +103010,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 646 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102457,9 +103075,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -102467,33 +103085,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102505,30 +103124,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102536,6 +103162,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102545,6 +103172,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102554,49 +103182,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 647 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102606,41 +103247,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102654,30 +103296,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102685,6 +103336,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102694,6 +103346,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102703,49 +103356,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 648 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102755,47 +103419,22670 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 LVCB: 2 LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 800 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1680 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1296 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1312 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1312 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 32 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 699 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 708 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 16 + LSPA: 4 + LSPB: 12 + LVCA: 48 + LVCB: 16 + LVPA: 4 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 12, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 12 + LSCB: 16 + LSPA: 16 + LSPB: 12 + LVCA: 12 + LVCB: 16 + LVPA: 16 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 24 + LSCB: 32 + LSPA: 8 + LSPB: 6 + LVCA: 24 + LVCB: 32 + LVPA: 8 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 24 + MacroTile1: 24 + MacroTileA: 24 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 3 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: [3, 4] + ThreadTile0: 3 + ThreadTile1: 4 + ThreadTileA: 3 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 790 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 791 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 792 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 793 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -102804,9 +126091,299 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 794 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 795 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102822,12 +126399,12 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -102866,14 +126443,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 649 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 796 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id016 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -102884,8 +126461,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102898,7 +126475,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102906,39 +126483,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102951,11 +126528,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102963,15 +126540,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103015,26 +126592,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 650 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 797 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -103047,7 +126624,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -103055,39 +126632,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103100,11 +126677,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103112,15 +126689,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103164,26 +126741,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 651 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 798 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id031 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -103205,7 +126782,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103218,25 +126795,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103250,10 +126823,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103261,20 +126834,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103313,25 +126886,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 652 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 799 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 4, 1] + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -103353,39 +126926,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 16 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103399,10 +126968,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103410,20 +126979,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103462,25 +127031,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 653 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 800 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -103502,39 +127071,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103548,10 +127117,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103559,15 +127128,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103611,25 +127180,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 654 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 801 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -103668,22 +127237,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103697,10 +127262,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103710,18 +127275,18 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103760,14 +127325,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 655 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 802 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id016 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -103778,8 +127343,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -103800,35 +127365,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -103847,9 +127412,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103857,15 +127422,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103909,26 +127474,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 656 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 803 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id029 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -103968,20 +127533,20 @@ KernelLanguage: Source LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103996,9 +127561,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104006,15 +127571,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -104058,26 +127623,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 657 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 + SolutionIndex: 804 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id016 + ThreadTile: *id031 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -104115,22 +127680,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104144,10 +127705,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104155,20 +127716,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104207,25 +127768,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 658 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 805 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: *id028 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -104239,7 +127800,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104247,39 +127808,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 16 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 384 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104292,11 +127853,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104304,15 +127865,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 6 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 6 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -104356,26 +127917,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 659 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 806 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -104396,39 +127957,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104442,10 +127999,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104453,20 +128010,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104505,96 +128062,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 660 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SolutionIndex: 807 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104602,20 +128155,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104654,48 +128207,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 661 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 808 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id031 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104703,47 +128256,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 128 - LSCB: 8 + LSCB: 32 LSPA: 8 LSPB: 32 LVCA: 32 LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 32 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104751,20 +128300,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104803,96 +128352,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 662 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 809 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 + ThreadTile: *id030 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104900,20 +128445,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104952,96 +128497,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 663 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 810 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105049,15 +128594,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105101,26 +128646,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 664 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 811 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -105133,7 +128678,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105142,7 +128687,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105150,36 +128695,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 128 LVCA: 32 - LVCB: 16 + LVCB: 2 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -105188,9 +128733,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105198,13 +128743,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -105250,25 +128795,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 665 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 812 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -105282,7 +128827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105291,7 +128836,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105299,26 +128844,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -105328,18 +128873,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105347,8 +128892,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -105399,26 +128944,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 666 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 813 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -105437,58 +128982,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105496,20 +129037,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105548,25 +129089,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 667 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 814 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -105588,56 +129129,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105645,15 +129186,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105697,25 +129238,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 668 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SolutionIndex: 815 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -105735,7 +129276,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -105746,7 +129287,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -105754,39 +129295,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPB: 64 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105794,20 +129331,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105846,25 +129383,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 669 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 816 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -105895,7 +129432,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -105905,37 +129442,37 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105943,15 +129480,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105995,25 +129532,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 670 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionIndex: 817 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -106035,8 +129572,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106044,47 +129581,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106092,14 +129629,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -106144,25 +129681,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 671 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 818 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -106182,58 +129719,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106241,20 +129774,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106293,25 +129826,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 672 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 819 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -106333,56 +129866,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106390,8 +129923,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -106442,25 +129975,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 673 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 820 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -106482,35 +130015,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 7680 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -106520,18 +130053,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106539,14 +130072,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -106591,26 +130124,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 674 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 821 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id025 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -106629,10 +130162,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106640,47 +130173,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106688,20 +130217,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106740,26 +130269,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 675 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 822 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -106781,7 +130310,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106789,26 +130318,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -106818,18 +130347,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106837,14 +130366,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -106889,26 +130418,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 676 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 823 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -106927,58 +130456,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106986,20 +130511,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107038,25 +130563,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 677 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 824 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id035 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107087,7 +130612,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -107095,39 +130620,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPB: 64 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107135,15 +130660,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107187,14 +130712,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 678 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SolutionIndex: 825 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -107205,7 +130730,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107225,7 +130750,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -107236,7 +130761,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -107246,37 +130771,33 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107284,20 +130805,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107336,25 +130857,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 679 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 826 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107376,56 +130897,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107433,14 +130954,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -107485,25 +131006,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 680 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 827 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107526,7 +131047,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -107534,47 +131055,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107582,14 +131103,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -107634,25 +131155,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 681 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 828 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107672,58 +131193,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107731,20 +131248,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107783,25 +131300,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 682 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 829 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107823,56 +131340,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107880,8 +131397,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -107932,25 +131449,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 683 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 830 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107972,35 +131489,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 7680 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -108010,18 +131527,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108029,14 +131546,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -108081,25 +131598,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 684 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 831 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id024 + VectorWidth: 2 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -108119,10 +131636,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108130,47 +131647,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108178,20 +131691,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108230,26 +131743,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 685 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 832 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -108270,56 +131783,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108327,15 +131840,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108379,26 +131892,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 686 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 833 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -108411,13 +131924,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -108428,7 +131941,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -108436,39 +131949,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108476,20 +131985,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108528,25 +132037,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 687 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 834 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -108560,7 +132069,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108577,7 +132086,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -108585,39 +132094,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108625,15 +132134,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108677,25 +132186,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 688 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + SolutionIndex: 835 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -108709,64 +132218,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108774,20 +132279,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108826,26 +132331,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 689 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 836 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -108858,7 +132363,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108866,56 +132371,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108923,13 +132428,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -108975,26 +132480,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 690 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 837 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109007,16 +132512,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109024,47 +132529,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109072,20 +132573,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109124,14 +132625,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 691 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 838 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -109142,8 +132643,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109156,7 +132657,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109173,7 +132674,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -109181,39 +132682,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109221,15 +132722,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109273,14 +132774,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 692 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SolutionIndex: 839 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -109291,8 +132792,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109305,64 +132806,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109370,20 +132867,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109422,26 +132919,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 693 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 840 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109454,7 +132951,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109462,56 +132959,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109519,14 +133016,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -109571,25 +133068,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 694 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 841 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -109603,13 +133100,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -109620,7 +133117,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -109628,39 +133125,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 32 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109668,20 +133161,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109720,14 +133213,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 695 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 842 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id027 + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -109738,7 +133231,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -109752,7 +133245,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109769,7 +133262,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -109777,39 +133270,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 32 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109817,15 +133310,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109869,14 +133362,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 696 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SolutionIndex: 843 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -109887,8 +133380,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109907,58 +133400,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109966,20 +133455,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110018,26 +133507,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 697 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 844 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110058,35 +133547,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -110096,18 +133585,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110115,14 +133604,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -110167,26 +133656,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 698 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 845 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110205,7 +133694,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -110216,7 +133705,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -110233,30 +133722,26 @@ LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110264,20 +133749,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110316,26 +133801,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 699 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 846 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id027 + ThreadTile: *id037 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110356,56 +133841,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110413,15 +133898,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110465,26 +133950,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 700 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 847 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110503,58 +133988,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110562,20 +134043,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110614,26 +134095,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 701 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 848 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110663,7 +134144,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -110671,39 +134152,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110711,14 +134192,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -110763,96 +134244,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 702 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 849 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + KernelLanguage: Source + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110860,15 +134341,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110912,96 +134393,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 703 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 850 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 + VectorWidth: 2 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 2 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 32 + LSCB: 2 + LSPA: 2 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111009,15 +134490,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111061,25 +134542,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 704 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 851 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -111093,7 +134574,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111101,7 +134582,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -111110,30 +134591,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111146,11 +134627,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111160,11 +134641,11 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -111210,26 +134691,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 705 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionIndex: 852 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id044 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111242,7 +134723,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111250,56 +134731,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111307,15 +134788,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111359,26 +134840,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 706 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 853 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id027 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id042 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111391,7 +134872,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111399,8 +134880,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -111408,30 +134889,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111444,11 +134925,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111456,15 +134937,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111508,26 +134989,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 707 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 854 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 + ThreadTile: *id041 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111540,7 +135021,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111548,56 +135029,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111606,14 +135087,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111657,26 +135138,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 708 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 855 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id042 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111689,7 +135170,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111697,39 +135178,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111742,11 +135223,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111754,15 +135235,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111806,26 +135287,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 709 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 856 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111838,7 +135319,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111846,39 +135327,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111891,11 +135372,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111903,15 +135384,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111955,26 +135436,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 710 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 857 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111987,7 +135468,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111995,56 +135476,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 8 + LSCB: 8 LSPA: 8 - LSPB: 32 - LVCA: 32 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112052,15 +135533,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112104,26 +135585,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 711 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SolutionIndex: 858 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id027 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -112136,7 +135617,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112144,7 +135625,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -112153,30 +135634,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -112189,11 +135670,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112203,11 +135684,11 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -112253,25 +135734,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 712 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 859 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: *id044 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -112285,7 +135766,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112293,7 +135774,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -112301,37 +135782,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 32 LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3360 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112339,10 +135820,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112350,13 +135831,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -112402,25 +135883,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 713 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SolutionIndex: 860 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id045 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 + VectorWidth: 4 + WorkGroup: *id046 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -112434,7 +135915,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112443,7 +135924,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -112451,36 +135932,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3360 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112488,9 +135969,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -112499,8 +135980,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -112551,14 +136032,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 714 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 + SolutionIndex: 861 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SuppresssNoLoadLoop: true + ThreadTile: *id045 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -112569,8 +136050,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 1 + WorkGroup: *id046 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -112583,7 +136064,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112591,56 +136072,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112648,13 +136129,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -112700,26 +136181,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 715 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 862 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id045 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -112732,7 +136213,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112749,7 +136230,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -112757,39 +136238,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112797,13 +136278,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -112849,26 +136330,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 716 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 + SolutionIndex: 863 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SuppresssNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + WorkGroup: *id046 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -112881,64 +136362,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112946,21 +136423,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -112998,25 +136475,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 717 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 864 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 32 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + ThreadTile: *id047 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 + VectorWidth: 1 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -113030,53 +136507,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCA: 128 + LSCB: 16 + LSPA: 2 LSPB: 16 - LVCA: 32 + LVCA: 128 LVCB: 16 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -113084,10 +136557,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113095,21 +136568,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113147,25 +136620,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 718 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 865 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id047 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id024 + WorkGroup: *id048 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -113179,15 +136652,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 16 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -113195,48 +136668,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 256 + LSCB: 16 + LSPA: 1 + LSPB: 16 + LVCA: 256 + LVCB: 16 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 4096 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113244,21 +136713,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113296,26 +136765,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 719 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 866 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id050 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: *id048 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -113328,15 +136797,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -113344,48 +136813,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 16 + LSPA: 2 LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113393,21 +136858,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113445,77 +136910,77 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 720 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 867 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id047 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 2112 LdsOffsetA: 0 LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113526,11 +136991,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113538,21 +137003,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113590,35 +137055,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 721 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 868 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: *id047 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id049 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -113629,7 +137094,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -113646,7 +137111,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 16 LSPA: 16 @@ -113656,11 +137121,11 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 2112 LdsOffsetA: 0 LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113697,7 +137162,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113735,35 +137200,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 722 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 869 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id029 + ThreadTile: *id047 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 1 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -113774,7 +137239,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -113791,25 +137256,21 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113821,9 +137282,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -113832,21 +137293,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113884,79 +137345,77 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 723 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 870 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + ThreadTile: *id050 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 1 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -113969,11 +137428,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113982,12 +137441,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 @@ -114033,75 +137490,74 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 724 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 871 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 1 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 1 LSPB: 32 - LVCA: 16 + LVCA: 256 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 1 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114110,15 +137566,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114126,14 +137582,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -114178,75 +137632,78 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 725 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SolutionIndex: 872 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id053 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114259,7 +137716,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -114272,9 +137729,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -114284,7 +137739,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114323,79 +137778,78 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 726 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 873 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 + ThreadTile: *id051 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114408,7 +137862,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -114421,9 +137875,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -114472,46 +137924,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 727 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionIndex: 874 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -114528,19 +137979,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 2 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114553,7 +138008,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -114565,20 +138020,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114617,46 +138070,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 728 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 875 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + WorkGroup: *id055 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -114673,23 +138125,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 2 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114702,7 +138154,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -114714,15 +138166,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114766,35 +138216,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 729 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 876 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id055 WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -114804,8 +138254,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -114822,7 +138271,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 16 LSPA: 16 @@ -114832,13 +138281,9 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114865,8 +138310,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -114876,7 +138319,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114915,35 +138358,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 730 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SolutionIndex: 877 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -114953,37 +138396,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114997,10 +138443,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115008,20 +138454,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115060,35 +138504,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 731 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SolutionIndex: 878 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id052 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -115098,8 +138542,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115116,23 +138559,19 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115146,10 +138585,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115159,18 +138598,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115209,46 +138646,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 732 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 879 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + WorkGroup: *id052 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115265,17 +138701,17 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3072 LdsOffsetA: 0 LdsOffsetB: 2048 LdsPadA: 0 @@ -115290,10 +138726,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115302,14 +138738,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -115354,46 +138788,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 733 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 880 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + ThreadTile: *id051 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115410,19 +138843,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115435,11 +138872,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115449,18 +138886,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115499,46 +138934,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 734 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 881 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115555,19 +138989,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115580,7 +139018,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -115594,18 +139032,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115644,46 +139080,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 735 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionIndex: 882 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115700,19 +139135,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115725,10 +139164,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115737,20 +139176,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115789,48 +139226,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 736 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 + SolutionIndex: 883 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id053 WorkGroupMapping: 8 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -115843,25 +139281,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115874,10 +139312,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115886,15 +139324,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115938,46 +139379,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 737 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 884 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115992,25 +139443,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116023,7 +139474,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -116037,13 +139488,16 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116087,14 +139541,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 738 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 885 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -116105,15 +139566,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -116126,9 +139590,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -116141,21 +139605,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -116174,9 +139638,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116184,15 +139648,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116236,33 +139703,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 739 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 886 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -116274,8 +139751,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -116290,21 +139767,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116318,10 +139799,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116329,20 +139810,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116381,33 +139865,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 740 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 887 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -116420,7 +139914,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -116435,25 +139929,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116467,10 +139961,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116478,15 +139972,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116530,81 +140027,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 741 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 888 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -116613,9 +140124,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116623,20 +140134,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116675,48 +140189,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 742 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 889 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -116729,27 +140253,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116760,11 +140284,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116774,13 +140298,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116824,33 +140351,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 743 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 890 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -116863,57 +140400,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116921,15 +140458,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116973,33 +140513,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 744 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 891 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117011,8 +140561,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117027,7 +140577,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -117039,11 +140589,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117068,18 +140622,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117118,14 +140675,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 745 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 892 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -117136,15 +140700,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117157,9 +140724,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117172,42 +140739,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117215,15 +140782,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -117267,17 +140837,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 746 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 893 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -117285,15 +140862,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117306,57 +140886,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117364,15 +140944,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -117416,46 +140999,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 747 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 894 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117470,27 +141063,31 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 2 - LVPB: 16 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117498,10 +141095,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117509,20 +141106,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117561,46 +141161,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 748 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 895 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117615,31 +141225,31 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117647,10 +141257,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117658,15 +141268,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -117710,33 +141323,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 749 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 896 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117748,8 +141371,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117764,23 +141387,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117792,9 +141419,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -117803,20 +141430,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117855,17 +141485,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 750 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 897 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -117873,15 +141510,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117894,9 +141534,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117909,42 +141549,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117952,15 +141592,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118004,14 +141647,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 751 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 898 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -118022,15 +141672,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118041,9 +141694,9 @@ DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -118058,7 +141711,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 @@ -118070,26 +141723,30 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118097,20 +141754,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118149,14 +141809,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 752 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 899 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -118167,15 +141834,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118188,9 +141858,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118203,42 +141873,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118246,15 +141916,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118298,85 +141971,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 753 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 900 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118384,10 +142067,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118395,15 +142078,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118447,33 +142133,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 754 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 901 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118485,8 +142181,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -118501,7 +142197,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -118513,11 +142209,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -118542,18 +142242,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118592,14 +142295,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 755 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 902 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -118610,15 +142320,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118631,9 +142344,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118646,42 +142359,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118689,15 +142402,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118741,17 +142457,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 756 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 903 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -118759,15 +142482,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118780,57 +142506,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118838,15 +142564,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118890,33 +142619,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 757 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 904 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118928,10 +142667,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118944,38 +142683,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118983,20 +142726,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119035,46 +142781,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 758 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 905 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119089,31 +142845,31 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -119121,10 +142877,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119132,15 +142888,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -119184,46 +142943,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 759 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 906 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119238,23 +143007,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -119265,11 +143038,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119277,20 +143050,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119329,48 +143105,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 760 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 907 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119383,42 +143169,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119426,15 +143212,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -119478,46 +143267,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 761 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 908 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119532,38 +143331,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119571,20 +143374,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119623,48 +143429,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 762 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 909 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119677,25 +143493,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119708,11 +143524,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119720,15 +143536,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -119749,6 +143568,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119758,6 +143578,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119772,48 +143593,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 763 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 910 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119826,21 +143657,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119853,10 +143688,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -119865,20 +143700,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119894,6 +143732,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119903,6 +143742,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119917,48 +143757,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 764 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 911 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119971,21 +143821,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 256 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 6400 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -120002,11 +143852,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120014,15 +143864,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -120043,6 +143896,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120052,6 +143906,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120066,48 +143921,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 765 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 912 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120120,21 +143985,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 LSPB: 32 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120147,11 +144016,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120159,20 +144028,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120188,6 +144060,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120197,6 +144070,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120211,17 +144085,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 766 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 913 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -120229,30 +144110,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120265,25 +144149,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120296,7 +144180,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -120310,13 +144194,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -120337,6 +144224,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120346,6 +144234,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120360,48 +144249,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 767 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 914 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120414,21 +144313,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120441,11 +144344,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120453,20 +144356,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120482,6 +144388,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120491,6 +144398,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120505,17 +144413,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 768 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 915 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -120523,30 +144438,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 15 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120559,25 +144477,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120590,11 +144508,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120602,15 +144520,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -120631,6 +144552,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120640,6 +144562,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120654,48 +144577,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 769 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 916 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120708,21 +144641,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120735,10 +144672,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -120747,20 +144684,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120776,6 +144716,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120785,6 +144726,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120799,17 +144741,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 770 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 917 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -120817,30 +144766,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120853,25 +144805,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120884,11 +144832,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120898,18 +144846,21 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120925,6 +144876,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120934,6 +144886,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120948,14 +144901,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 771 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 918 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -120966,30 +144926,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121002,21 +144965,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121029,7 +144996,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -121043,18 +145010,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -121070,6 +145040,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121079,6 +145050,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121093,48 +145065,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 772 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 919 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121147,25 +145129,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121178,11 +145156,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121190,20 +145168,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -121219,6 +145200,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121228,6 +145210,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121242,17 +145225,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 773 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 920 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -121260,28 +145250,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -121296,21 +145289,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121323,11 +145320,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121335,20 +145332,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -121364,6 +145364,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121373,6 +145374,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121387,46 +145389,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 774 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 921 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -121441,42 +145453,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121484,15 +145496,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -121513,6 +145528,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121522,6 +145538,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121536,17 +145553,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 775 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 922 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -121554,61 +145578,64 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121621,11 +145648,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121633,15 +145660,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -121662,6 +145692,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121671,6 +145702,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121685,48 +145717,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 776 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 923 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id040 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121738,43 +145780,43 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 2 - LSPA: 2 - LSPB: 32 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121784,13 +145826,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -121811,6 +145856,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121820,6 +145866,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121834,33 +145881,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 777 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 924 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id040 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -121873,57 +145930,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121931,15 +145988,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -121960,6 +146020,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121969,6 +146030,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121983,79 +146045,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 778 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 925 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 11 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -122068,11 +146140,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122080,15 +146152,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122109,6 +146184,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122118,6 +146194,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122132,96 +146209,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 779 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 926 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122229,15 +146316,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122258,6 +146348,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122267,6 +146358,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122279,98 +146371,108 @@ TotalIndices: 4 TransposeA: false TransposeB: false - UseBeta: true - UseInitialStrides: false - SolutionIndex: 780 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 927 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122378,15 +146480,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122407,6 +146512,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122416,6 +146522,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122430,96 +146537,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 781 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 928 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122527,15 +146644,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122556,6 +146676,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122565,6 +146686,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122579,33 +146701,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 782 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 929 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -122618,57 +146750,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122676,15 +146808,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122705,6 +146840,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122714,6 +146850,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122728,96 +146865,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 783 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 930 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122825,15 +146972,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122854,6 +147004,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122863,6 +147014,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122877,33 +147029,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 784 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 931 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -122916,57 +147078,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122974,15 +147136,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123003,6 +147168,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123012,6 +147178,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123026,48 +147193,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 785 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 932 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123084,23 +147261,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 128 LVCA: 32 - LVCB: 8 - LVPA: 2 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3360 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123111,11 +147288,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123123,15 +147300,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123152,6 +147332,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123161,6 +147342,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123175,17 +147357,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 786 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 933 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 32 + SubGroupA: 16 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -123193,30 +147382,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 + WorkGroup: [16, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123232,24 +147424,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3360 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123260,11 +147452,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123274,13 +147466,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123301,6 +147496,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123310,6 +147506,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123324,14 +147521,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 787 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 934 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -123342,21 +147546,24 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -123365,7 +147572,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123378,27 +147585,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123409,10 +147616,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -123421,8 +147628,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -123430,6 +147637,11 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123441,6 +147653,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123450,6 +147663,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123459,6 +147673,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123473,20 +147688,27 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 788 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 935 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id045 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -123494,27 +147716,28 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123527,27 +147750,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123558,7 +147781,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -123572,13 +147795,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123590,6 +147816,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123599,6 +147826,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123608,6 +147836,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123622,77 +147851,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 789 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 936 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123703,11 +147946,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123715,26 +147958,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123744,6 +147993,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123753,6 +148003,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123767,77 +148018,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 790 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 937 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 2 - LSPB: 16 - LVCA: 128 - LVCB: 16 + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123848,11 +148111,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123860,26 +148123,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123889,6 +148156,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123898,6 +148166,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123912,92 +148181,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 791 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 938 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id048 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 16 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 128 LSCB: 16 - LSPA: 1 - LSPB: 16 - LVCA: 256 - LVCB: 16 - LVPA: 1 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4640 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124005,26 +148288,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124034,6 +148323,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124043,6 +148333,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124057,33 +148348,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 792 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id050 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 939 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id048 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -124095,39 +148394,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124139,10 +148442,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124150,26 +148453,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124179,6 +148486,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124188,6 +148496,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124202,33 +148511,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 793 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 940 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -124240,39 +148559,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124284,10 +148607,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124295,26 +148618,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124324,6 +148653,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124333,6 +148663,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124347,33 +148678,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 794 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 941 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -124385,8 +148724,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -124401,23 +148740,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124429,10 +148772,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124440,26 +148783,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124469,6 +148816,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124478,6 +148826,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124492,77 +148841,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 795 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 942 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124573,10 +148936,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -124585,26 +148948,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124614,6 +148983,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124623,6 +148993,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124637,44 +149008,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 796 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 943 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id050 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id049 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -124684,32 +149065,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124720,10 +149097,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -124732,24 +149109,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124759,6 +149144,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124768,6 +149154,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124782,45 +149169,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 797 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 944 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id052 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DepthU: 16 + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -124838,35 +149234,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124874,13 +149270,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -124892,6 +149295,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124901,6 +149305,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124910,6 +149315,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124924,16 +149330,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 798 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id051 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 945 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -124941,16 +149355,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id053 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -124962,6 +149376,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -124977,27 +149392,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 4 + LSPA: 5 LSPB: 64 - LVCA: 64 + LVCA: 48 LVCB: 4 - LVPA: 2 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125009,9 +149424,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125020,13 +149435,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -125038,6 +149458,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125047,6 +149468,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125056,6 +149478,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125070,45 +149493,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 799 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 946 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -125123,27 +149557,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125154,10 +149588,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125166,24 +149600,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125193,6 +149635,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125202,6 +149645,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125216,80 +149660,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 800 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 947 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id053 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125300,11 +149753,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125312,24 +149765,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125339,6 +149800,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125348,6 +149810,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125362,80 +149825,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 801 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 948 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id055 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125446,11 +149918,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125458,24 +149930,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125485,6 +149963,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125494,6 +149973,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125508,47 +149988,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 802 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 949 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id055 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -125561,23 +150052,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125588,11 +150083,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125602,22 +150097,30 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125627,6 +150130,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125636,6 +150140,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125650,47 +150155,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 803 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id051 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 950 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -125703,27 +150217,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125734,10 +150248,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125746,13 +150260,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -125764,6 +150283,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125773,6 +150293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125782,6 +150303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125796,47 +150318,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 804 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 951 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -125849,23 +150382,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125876,11 +150413,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125890,22 +150427,30 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125915,6 +150460,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125924,6 +150470,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125938,47 +150485,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 805 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 952 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -125991,23 +150547,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126018,7 +150578,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -126034,20 +150594,26 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126057,6 +150623,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126066,6 +150633,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126080,33 +150648,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 806 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 953 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -126118,7 +150696,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -126133,27 +150712,23 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126165,9 +150740,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -126176,24 +150751,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126203,6 +150786,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126212,6 +150796,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126226,33 +150811,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 807 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 954 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -126264,7 +150857,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -126279,7 +150873,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -126291,15 +150885,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126324,11 +150918,18 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -126340,6 +150941,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126349,6 +150951,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126358,6 +150961,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126372,33 +150976,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 808 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 955 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id053 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -126410,6 +151022,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -126425,7 +151038,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -126437,15 +151050,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126475,154 +151088,6 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: false - UseBeta: true - UseInitialStrides: false - SolutionIndex: 809 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: *id053 - WorkGroupMapping: 8 - WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -126639,6 +151104,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126648,6 +151114,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126657,6 +151124,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126675,8 +151143,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 956 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126691,15 +151159,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -126719,8 +151187,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -126738,24 +151206,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126767,10 +151231,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126778,12 +151242,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126794,13 +151260,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126810,6 +151277,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126819,6 +151287,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126837,8 +151306,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + SolutionIndex: 957 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126846,24 +151315,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126875,16 +151342,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -126901,23 +151368,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126928,7 +151395,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -126942,9 +151409,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -126963,6 +151432,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126972,6 +151442,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126981,6 +151452,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126999,8 +151471,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 958 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127015,7 +151487,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -127023,9 +151495,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127071,15 +151541,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127092,9 +151562,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127102,12 +151572,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -127125,6 +151595,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127134,6 +151605,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127143,6 +151615,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127161,8 +151634,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 959 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127171,13 +151644,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -127199,47 +151672,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127252,11 +151721,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127264,13 +151733,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127280,13 +151751,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127296,6 +151768,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127305,6 +151778,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127323,33 +151797,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + SolutionIndex: 960 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127361,64 +151833,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCB: 8 + LSPA: 2 LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127427,12 +151895,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127442,13 +151912,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127458,6 +151929,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127467,6 +151939,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127485,8 +151958,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + SolutionIndex: 961 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127494,24 +151967,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127529,43 +152000,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127577,10 +152044,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127588,13 +152055,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127604,13 +152073,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127620,6 +152090,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127629,6 +152100,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127647,33 +152119,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 962 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127691,57 +152161,53 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127751,12 +152217,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127766,13 +152234,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127782,6 +152251,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127791,6 +152261,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127809,33 +152280,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 963 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127853,43 +152322,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127901,10 +152366,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127912,13 +152377,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127928,13 +152395,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127944,6 +152412,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127953,6 +152422,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127971,33 +152441,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 964 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128009,14 +152477,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -128035,27 +152503,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128074,13 +152542,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128097,6 +152567,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128106,6 +152577,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128115,6 +152587,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128133,8 +152606,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 965 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128154,12 +152627,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128171,7 +152642,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128180,7 +152651,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -128197,27 +152668,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128226,9 +152697,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128242,7 +152713,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128259,6 +152730,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128268,6 +152740,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128277,6 +152750,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128295,8 +152769,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 820 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + SolutionIndex: 966 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128305,21 +152779,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -128333,14 +152807,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -128358,39 +152832,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 LVPA: 2 - LVPB: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 640 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128398,13 +152872,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128421,6 +152897,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128430,6 +152907,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128439,6 +152917,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128457,33 +152936,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 821 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 967 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128495,7 +152972,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128504,7 +152981,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -128521,34 +152998,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -128560,13 +153037,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128583,6 +153060,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128592,6 +153070,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128601,6 +153080,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128619,8 +153099,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 822 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 968 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128640,10 +153120,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -128663,8 +153143,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -128682,24 +153162,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -128711,10 +153187,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128722,13 +153198,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128738,13 +153216,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128754,6 +153233,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128763,6 +153243,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128781,20 +153262,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 823 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 969 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -128802,12 +153283,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128825,10 +153304,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -128846,33 +153325,29 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -128884,13 +153359,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128900,13 +153375,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128916,6 +153392,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128925,6 +153402,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128943,8 +153421,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 824 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 970 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128952,7 +153430,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -128964,7 +153442,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -128981,53 +153459,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129047,11 +153521,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -129062,13 +153538,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129078,6 +153555,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129087,6 +153565,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129105,33 +153584,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 825 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + SolutionIndex: 971 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129143,53 +153620,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 + LVCA: 64 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129197,10 +153670,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129209,11 +153682,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -129224,13 +153699,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129240,6 +153716,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129249,6 +153726,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129267,33 +153745,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 826 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 972 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129305,64 +153781,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129370,12 +153846,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -129393,6 +153871,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129402,6 +153881,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129411,6 +153891,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129429,15 +153910,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 827 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 973 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -129445,17 +153926,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129467,49 +153946,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -129520,11 +153995,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129532,13 +154007,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129548,13 +154025,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129564,6 +154042,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129573,6 +154052,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129591,33 +154071,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 828 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 974 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129629,53 +154107,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129684,9 +154158,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129695,12 +154169,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129710,13 +154186,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129726,6 +154203,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129735,6 +154213,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129753,8 +154232,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 829 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 975 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129762,24 +154241,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129791,53 +154268,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129846,9 +154319,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129856,13 +154329,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129872,13 +154347,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129888,6 +154364,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129897,6 +154374,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129915,8 +154393,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 830 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 976 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129924,24 +154402,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129953,53 +154429,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 + LVCA: 64 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130007,10 +154483,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130019,11 +154495,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130041,6 +154519,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130050,6 +154529,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130059,6 +154539,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130077,33 +154558,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 831 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 977 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130115,60 +154594,56 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -130180,13 +154655,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130196,13 +154673,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130212,6 +154690,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130221,6 +154700,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130239,8 +154719,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 832 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 978 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130248,24 +154728,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130277,49 +154755,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130330,11 +154804,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130342,13 +154816,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130358,13 +154834,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130374,6 +154851,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130383,6 +154861,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130401,33 +154880,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 833 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + SolutionIndex: 979 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130445,58 +154922,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130504,12 +154977,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130520,13 +154995,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130536,6 +155012,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130545,6 +155022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130563,33 +155041,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 834 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 980 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130601,53 +155077,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130667,11 +155139,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130682,13 +155156,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130698,6 +155173,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130707,6 +155183,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130725,33 +155202,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 835 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 981 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130763,49 +155238,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130816,11 +155287,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130828,12 +155299,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130844,13 +155317,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130889,8 +155363,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 836 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + SolutionIndex: 982 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130898,24 +155372,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130927,49 +155399,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130980,11 +155448,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130992,13 +155460,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131008,13 +155478,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131053,33 +155524,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 837 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + SolutionIndex: 983 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131091,49 +155560,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 4 - LSPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 16 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131144,11 +155609,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131156,13 +155621,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131172,13 +155639,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131217,33 +155685,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 838 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + SolutionIndex: 984 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131255,49 +155721,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131308,11 +155770,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131320,13 +155782,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131336,13 +155800,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131381,33 +155846,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 839 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + SolutionIndex: 985 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131419,49 +155882,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131472,10 +155931,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131484,12 +155943,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -131500,13 +155961,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131545,33 +156007,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 840 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + SolutionIndex: 986 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 7 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131583,49 +156043,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131636,11 +156092,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131648,12 +156104,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -131664,13 +156122,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131709,33 +156168,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 841 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + SolutionIndex: 987 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 15 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131747,49 +156204,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131800,11 +156253,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131812,13 +156265,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131828,13 +156283,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131873,33 +156329,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 842 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + SolutionIndex: 988 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131911,49 +156365,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131964,11 +156414,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131976,12 +156426,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -131992,13 +156444,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132037,8 +156490,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 843 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + SolutionIndex: 989 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132046,24 +156499,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 17 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132082,38 +156533,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 64 LSCB: 8 - LSPA: 4 + LSPA: 2 LSPB: 16 LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 8 + LVCB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132125,10 +156576,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 16 - MacroTileA: 256 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132138,11 +156589,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132159,6 +156612,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132197,15 +156651,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 844 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + SolutionIndex: 990 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -132213,17 +156667,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 17 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132241,43 +156693,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132289,10 +156737,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132300,13 +156748,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132316,13 +156766,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132361,33 +156812,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 845 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + SolutionIndex: 991 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132399,45 +156848,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 4 - LSPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 LVCA: 64 - LVCB: 4 - LVPA: 1 + LVCB: 16 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132448,11 +156897,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 16 - MacroTileA: 256 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132462,11 +156911,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132483,6 +156934,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132521,15 +156973,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 846 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + SolutionIndex: 992 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -132537,17 +156989,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132559,16 +157009,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -132585,23 +157035,19 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132612,11 +157058,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132626,11 +157072,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132640,13 +157088,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132685,16 +157134,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 847 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + SolutionIndex: 993 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -132706,12 +157155,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132723,16 +157170,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -132749,27 +157196,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -132778,9 +157221,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132794,7 +157237,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132804,13 +157247,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132849,8 +157293,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 848 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + SolutionIndex: 994 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132858,22 +157302,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -132887,47 +157331,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132940,11 +157380,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132952,12 +157392,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -132968,7 +157408,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133013,8 +157453,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 849 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + SolutionIndex: 995 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133022,22 +157462,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 7 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -133051,64 +157491,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133116,8 +157552,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -133132,8 +157568,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -133177,31 +157613,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 850 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + SolutionIndex: 996 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 7 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -133221,58 +157657,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133280,8 +157712,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -133296,8 +157728,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -133341,29 +157773,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 851 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + SolutionIndex: 997 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 11 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -133385,41 +157817,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133433,10 +157861,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133444,8 +157872,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -133460,7 +157888,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133505,8 +157933,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 852 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + SolutionIndex: 998 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133514,20 +157942,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -133549,41 +157977,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133597,10 +158021,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133608,13 +158032,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133624,8 +158048,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -133669,29 +158093,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 853 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 999 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -133713,9 +158137,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -133723,31 +158147,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 16 + LSPB: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133761,10 +158181,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133772,13 +158192,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133788,7 +158208,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133833,29 +158253,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 854 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + SolutionIndex: 1000 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -133871,7 +158291,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -133879,56 +158299,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 16 + LSPB: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133936,12 +158356,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -133997,31 +158417,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 855 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 1001 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134035,15 +158455,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -134051,48 +158471,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134100,8 +158516,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134116,7 +158532,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -134161,31 +158577,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 856 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + SolutionIndex: 1002 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134205,41 +158621,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -134253,10 +158665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134264,8 +158676,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134280,8 +158692,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -134325,8 +158737,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 857 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + SolutionIndex: 1003 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -134334,20 +158746,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -134369,41 +158781,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -134417,9 +158825,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -134428,12 +158836,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -134444,8 +158852,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -134489,8 +158897,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 858 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + SolutionIndex: 1004 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -134498,20 +158906,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -134533,41 +158941,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 128 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -134581,10 +158985,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134592,13 +158996,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -134608,7 +159012,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -134653,29 +159057,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 859 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + SolutionIndex: 1005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 32, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -134697,58 +159101,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134756,8 +159156,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134772,8 +159172,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -134817,28 +159217,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 860 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + SolutionIndex: 1006 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -134855,15 +159255,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -134871,37 +159271,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -134909,10 +159305,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134920,10 +159316,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134938,14 +159332,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -134984,31 +159377,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 861 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1007 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -135020,7 +159415,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -135028,7 +159423,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -135036,23 +159431,23 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -135066,7 +159461,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -135074,10 +159469,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135085,11 +159480,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -135108,7 +159503,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135147,31 +159541,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 862 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1008 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135185,15 +159579,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -135201,37 +159595,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -135239,10 +159629,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135250,15 +159640,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135268,14 +159656,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135314,31 +159701,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 863 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1009 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -135350,7 +159739,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -135358,7 +159747,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -135366,37 +159755,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -135404,10 +159793,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135415,13 +159804,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135438,7 +159827,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135477,31 +159865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 864 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1010 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135515,64 +159903,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135580,15 +159964,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135598,14 +159980,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135644,31 +160025,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 865 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1011 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -135680,7 +160063,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -135688,56 +160071,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135745,13 +160128,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135768,7 +160151,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135807,31 +160189,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 866 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1012 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135851,43 +160233,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1056 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -135899,10 +160277,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135910,14 +160288,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -135928,14 +160304,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135974,8 +160349,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 867 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1013 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -135983,22 +160358,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136016,43 +160393,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -136064,10 +160437,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136075,11 +160448,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -136091,14 +160464,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136137,8 +160509,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 868 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1014 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -136146,20 +160518,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -136175,47 +160547,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -136228,10 +160596,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -136240,11 +160608,9 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 @@ -136258,14 +160624,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136304,8 +160669,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 869 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1015 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -136313,14 +160678,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -136328,7 +160693,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136347,53 +160714,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 544 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136401,15 +160768,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136426,7 +160791,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136465,31 +160829,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 870 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1016 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136508,53 +160874,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136562,14 +160928,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -136587,7 +160951,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136626,31 +160989,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 871 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1017 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136662,7 +161027,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -136687,39 +161052,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136727,11 +161092,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -136750,7 +161115,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136789,32 +161153,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 872 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1018 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -136827,14 +161191,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -136853,21 +161217,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1536 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -136880,11 +161244,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136892,13 +161256,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -136911,13 +161273,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136956,8 +161317,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 873 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1019 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -136966,21 +161327,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136992,14 +161355,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -137012,44 +161375,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137057,14 +161420,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137082,7 +161443,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137121,31 +161481,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 874 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1020 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -137157,13 +161519,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137177,44 +161539,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137222,12 +161580,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137238,14 +161596,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137284,31 +161641,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 875 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1021 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137322,15 +161679,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -137338,37 +161695,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137376,10 +161733,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137387,10 +161744,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -137412,7 +161767,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137451,31 +161805,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 876 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1022 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -137487,7 +161843,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137495,7 +161851,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -137503,25 +161859,25 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -137529,11 +161885,11 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137541,10 +161897,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137552,11 +161908,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -137569,13 +161925,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137614,31 +161969,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 877 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1023 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137652,15 +162007,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -137668,37 +162023,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137706,10 +162057,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137717,13 +162068,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -137735,14 +162084,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137781,31 +162129,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 878 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1024 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -137817,7 +162167,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137825,7 +162175,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -137833,25 +162183,25 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -137859,11 +162209,11 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137871,10 +162221,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137882,11 +162232,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -137905,7 +162255,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137944,31 +162293,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 879 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1025 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137982,60 +162331,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138043,14 +162396,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138061,14 +162412,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138107,31 +162457,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 880 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1026 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -138143,64 +162495,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138208,14 +162556,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138226,14 +162572,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138272,31 +162617,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 881 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1027 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -138308,7 +162655,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -138316,56 +162663,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138373,12 +162720,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138396,7 +162743,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138435,31 +162781,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 882 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1028 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -138473,60 +162819,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138534,15 +162884,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -138552,14 +162900,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138598,31 +162945,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 883 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1029 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -138634,39 +162983,39 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 64 + LSPB: 16 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 @@ -138676,7 +163025,7 @@ LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -138687,11 +163036,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138699,14 +163048,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138724,7 +163071,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138763,8 +163109,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 884 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1030 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -138773,21 +163119,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -138799,64 +163147,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138864,11 +163208,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138880,14 +163224,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138926,31 +163269,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 885 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1031 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -138964,49 +163307,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139014,10 +163361,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139025,15 +163372,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139043,14 +163388,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139089,31 +163433,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 886 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1032 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139125,49 +163471,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139175,9 +163521,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -139186,15 +163532,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139205,13 +163549,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139250,14 +163593,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 887 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1033 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -139266,15 +163609,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139286,49 +163631,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139336,9 +163685,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -139347,15 +163696,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139365,14 +163712,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139411,31 +163757,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 888 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1034 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139447,60 +163795,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139508,15 +163860,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139526,14 +163876,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139572,31 +163921,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 889 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1035 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139608,60 +163959,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139669,15 +164024,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139687,14 +164040,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139733,31 +164085,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 890 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1036 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139769,15 +164123,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -139785,37 +164139,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 832 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139823,10 +164173,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139834,13 +164184,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -139852,14 +164200,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139898,31 +164245,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 891 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1037 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139934,7 +164283,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -139942,7 +164291,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -139950,37 +164299,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 1856 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139988,10 +164337,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139999,11 +164348,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -140022,7 +164371,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140061,31 +164409,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 892 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1038 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140099,64 +164447,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140164,14 +164512,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -140189,7 +164535,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140228,31 +164573,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 893 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1039 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -140264,7 +164611,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -140272,7 +164619,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -140280,37 +164627,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140318,10 +164665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140329,13 +164676,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140346,13 +164693,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140391,31 +164737,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 894 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1040 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140429,60 +164775,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140490,15 +164840,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140508,14 +164856,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140554,31 +164901,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 895 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1041 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -140590,59 +164939,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -140651,13 +165004,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140667,14 +165020,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140713,31 +165065,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 896 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1042 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140751,49 +165103,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140801,10 +165157,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140812,15 +165168,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140830,14 +165184,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140876,31 +165229,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 897 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1043 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -140912,49 +165267,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140962,10 +165321,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140973,15 +165332,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140991,14 +165348,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141037,31 +165393,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 898 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1044 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141073,41 +165431,41 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3080 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -141115,11 +165473,11 @@ LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141127,10 +165485,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141138,15 +165496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141163,7 +165519,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141202,31 +165557,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 899 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1045 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141238,49 +165595,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141288,10 +165649,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141299,15 +165660,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141317,14 +165676,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141363,31 +165721,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 900 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1046 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141399,49 +165759,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 1088 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141449,9 +165809,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -141460,15 +165820,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141479,13 +165837,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141524,31 +165881,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 901 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1047 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141560,49 +165919,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141610,9 +165973,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -141621,15 +165984,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141639,14 +166000,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141685,31 +166045,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 902 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1048 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141721,53 +166083,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3080 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141775,10 +166137,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141786,14 +166148,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -141805,13 +166165,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141850,31 +166209,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 903 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1049 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141886,49 +166247,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141936,10 +166301,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141947,15 +166312,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141965,14 +166328,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142011,31 +166373,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 904 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1050 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142047,49 +166411,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -142097,10 +166465,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142108,15 +166476,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142126,14 +166492,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142172,31 +166537,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 905 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1051 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142208,60 +166575,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142269,15 +166640,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142287,14 +166656,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142333,31 +166701,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 906 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1052 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142369,60 +166739,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142430,15 +166804,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142448,14 +166820,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142494,31 +166865,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 907 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1053 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142537,38 +166910,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -142580,9 +166953,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -142591,15 +166964,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142610,13 +166981,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142655,31 +167025,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 908 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1054 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142698,38 +167070,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -142741,10 +167113,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142752,13 +167124,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 128 PackBatchDims: 0 @@ -142777,7 +167147,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142816,31 +167185,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 909 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1055 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142859,53 +167230,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142913,15 +167284,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142938,7 +167307,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142977,31 +167345,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 910 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1056 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143020,53 +167390,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143074,15 +167444,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143099,7 +167467,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143138,31 +167505,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 911 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1057 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143181,52 +167550,52 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 4 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 + LSPA: 16 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -143235,14 +167604,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -143254,13 +167621,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143299,31 +167665,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 912 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1058 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143335,60 +167703,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143396,14 +167764,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -143421,7 +167787,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143460,31 +167825,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 913 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1059 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143503,38 +167870,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -143546,10 +167913,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143557,13 +167924,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -143576,13 +167941,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143621,31 +167985,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 914 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1060 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143657,59 +168023,59 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -143718,14 +168084,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -143737,13 +168101,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143782,31 +168145,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 915 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1061 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143818,59 +168183,59 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -143879,15 +168244,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143904,7 +168267,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143943,31 +168305,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 916 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1062 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143979,60 +168343,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144040,15 +168404,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144059,13 +168421,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -144104,31 +168465,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 917 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 1063 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -144147,53 +168510,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 4 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144201,15 +168564,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144220,13 +168581,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -144265,31 +168625,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 918 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1064 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -144301,15 +168663,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -144317,44 +168679,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCA: 4 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144362,15 +168724,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144387,7 +168747,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -144426,31 +168785,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 919 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1065 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -144462,15 +168823,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -144478,44 +168839,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCA: 4 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144523,13 +168888,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144539,14 +168904,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -144585,31 +168949,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 920 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1066 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144623,13 +168987,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144643,40 +169007,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 4 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144684,12 +169052,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -144700,7 +169068,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144745,16 +169113,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 921 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1067 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -144766,10 +169134,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144783,7 +169151,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -144808,20 +169176,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 4 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1344 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144832,11 +169200,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144849,7 +169217,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -144861,7 +169229,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144905,15 +169273,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 922 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1068 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -144926,10 +169294,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144943,13 +169311,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144968,34 +169336,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145004,12 +169376,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145020,8 +169392,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145065,20 +169437,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 923 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1069 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -145086,10 +169458,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145103,13 +169475,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -145123,40 +169495,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145164,8 +169540,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -145180,8 +169556,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -145225,20 +169601,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 924 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1070 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -145246,10 +169622,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145263,13 +169639,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -145283,36 +169659,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -145324,13 +169704,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145340,8 +169720,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145385,20 +169765,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 925 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1071 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -145406,10 +169786,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145423,7 +169803,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145443,40 +169823,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145484,13 +169864,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145545,31 +169925,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 926 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1072 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145583,7 +169963,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145608,28 +169988,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3392 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -145637,10 +170017,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145653,7 +170033,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145709,31 +170089,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 927 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1073 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145747,7 +170127,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145767,25 +170147,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 4 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 2240 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145796,11 +170176,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145808,12 +170188,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145869,31 +170249,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 928 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1074 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145907,13 +170287,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -145932,34 +170312,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145968,12 +170352,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145984,8 +170368,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146029,20 +170413,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 929 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 1075 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -146050,10 +170434,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146067,13 +170451,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -146087,40 +170471,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146128,12 +170516,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146144,8 +170532,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146189,31 +170577,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 930 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1076 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146227,7 +170615,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -146247,40 +170635,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146288,8 +170676,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -146349,15 +170737,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 931 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1077 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -146370,10 +170758,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146387,13 +170775,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -146412,24 +170800,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -146437,10 +170829,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146453,7 +170845,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146464,8 +170856,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146509,31 +170901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 932 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1078 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146553,7 +170945,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -146572,35 +170964,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 32 LSPA: 32 LSPB: 16 - LVCA: 8 + LVCA: 4 LVCB: 16 LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146613,7 +171009,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146624,8 +171020,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146669,20 +171065,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 933 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1079 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -146690,8 +171086,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [2, 16, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -146727,7 +171123,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -146735,15 +171131,15 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -146762,9 +171158,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146772,7 +171168,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -146833,20 +171229,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 934 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1080 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -146854,7 +171250,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -146877,7 +171273,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -146896,20 +171292,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 16 LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -146921,9 +171321,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 8 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -146938,7 +171338,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146948,7 +171348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -146993,16 +171393,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 935 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1081 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 4 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -147014,7 +171414,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -147051,29 +171451,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -147085,10 +171485,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147096,13 +171496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147157,28 +171557,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 936 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1082 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -147201,7 +171601,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147215,40 +171615,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147256,13 +171660,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147272,7 +171676,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147317,16 +171721,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 937 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1083 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -147338,8 +171742,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147372,47 +171776,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147420,13 +171824,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147481,28 +171885,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 938 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1084 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -147519,13 +171923,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147539,40 +171943,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1056 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147580,11 +171988,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -147596,7 +172004,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147641,20 +172049,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 939 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1085 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -147662,10 +172070,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147679,13 +172087,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147699,40 +172107,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147740,12 +172152,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147756,8 +172168,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -147801,16 +172213,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 940 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1086 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [2, 4] ThreadTile0: 2 ThreadTile1: 4 @@ -147822,10 +172234,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147839,13 +172251,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147859,40 +172271,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147900,12 +172316,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147916,7 +172332,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147961,31 +172377,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 941 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1087 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147999,13 +172415,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148024,24 +172440,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 544 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -148049,9 +172469,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -148064,9 +172484,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -148076,7 +172496,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148121,31 +172541,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 942 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1088 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148159,13 +172579,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148184,34 +172604,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -148220,11 +172644,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148236,7 +172660,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148281,16 +172705,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 943 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1089 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 @@ -148302,10 +172726,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148319,7 +172743,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148339,33 +172763,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -148373,9 +172797,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -148384,7 +172808,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -148445,31 +172869,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 944 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1090 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148508,38 +172932,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -148553,8 +172977,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -148609,16 +173033,180 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 945 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1091 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1092 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 4 + SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -148630,7 +173218,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -148664,7 +173252,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -148693,14 +173281,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -148773,28 +173361,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 946 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1093 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -148828,7 +173416,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -148845,7 +173433,7 @@ LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -148853,14 +173441,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -148933,28 +173521,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 947 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1094 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -148988,7 +173576,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -149017,14 +173605,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -149097,28 +173685,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 948 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1095 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -149152,46 +173740,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -149200,11 +173788,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -149217,7 +173805,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -149261,29 +173849,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 949 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1096 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -149299,13 +173887,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -149324,24 +173912,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149349,10 +173941,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149360,11 +173952,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -149376,7 +173968,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -149421,31 +174013,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 950 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1097 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149459,7 +174051,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149484,28 +174076,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149513,10 +174105,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149524,11 +174116,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -149585,31 +174177,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 951 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1098 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149623,13 +174215,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -149648,28 +174240,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149677,10 +174265,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149688,12 +174276,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149704,7 +174292,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -149749,20 +174337,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 952 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1099 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149770,10 +174358,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149787,7 +174375,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149812,35 +174400,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149848,11 +174436,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -149865,7 +174453,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -149909,20 +174497,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 953 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1100 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149930,10 +174518,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149947,7 +174535,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149955,56 +174543,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150012,12 +174600,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -150073,31 +174661,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 954 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1101 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150111,7 +174699,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150119,56 +174707,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150176,13 +174764,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -150237,31 +174825,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 955 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1102 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150283,41 +174871,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -150330,9 +174918,9 @@ LoopTail: true LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150340,11 +174928,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -150401,8 +174989,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 956 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 1103 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -150411,19 +174999,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -150439,13 +175027,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -150459,29 +175047,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -150489,10 +175081,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150500,8 +175092,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150516,7 +175108,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -150561,31 +175153,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 957 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1104 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150599,13 +175191,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -150619,33 +175211,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -150653,10 +175241,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150664,8 +175252,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150680,7 +175268,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -150725,31 +175313,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 958 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1105 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150763,13 +175351,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -150788,24 +175376,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -150813,10 +175405,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150824,11 +175416,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -150840,7 +175432,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -150885,31 +175477,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 959 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1106 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150923,7 +175515,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150948,16 +175540,16 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -150976,11 +175568,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150988,13 +175580,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151049,31 +175641,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 960 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1107 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151087,13 +175679,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151112,39 +175704,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151152,11 +175740,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -151168,8 +175756,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -151213,20 +175801,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 961 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1108 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -151234,10 +175822,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151251,7 +175839,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151276,22 +175864,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -151304,10 +175892,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -151316,13 +175904,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151377,20 +175965,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 962 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1109 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -151398,10 +175986,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151415,7 +176003,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151435,21 +176023,21 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 832 + LdsNumElements: 8192 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -151464,11 +176052,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151476,13 +176064,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151493,7 +176081,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -151537,31 +176125,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 963 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1110 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151575,7 +176163,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151595,27 +176183,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 + LdsNumElements: 8192 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -151628,11 +176216,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151640,13 +176228,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151701,31 +176289,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 964 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1111 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151739,13 +176327,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151756,7 +176344,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -151764,28 +176352,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -151793,10 +176377,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151804,13 +176388,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151820,8 +176404,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -151865,31 +176449,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 965 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1112 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151903,13 +176487,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151920,47 +176504,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151968,12 +176548,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151984,8 +176564,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -152029,31 +176609,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 966 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1113 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152084,30 +176664,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 16 LVPA: 16 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152121,10 +176701,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152132,13 +176712,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152193,28 +176773,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 967 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1114 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -152248,7 +176828,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -152256,22 +176836,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152285,9 +176865,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -152296,13 +176876,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152357,20 +176937,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 968 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1115 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -152378,7 +176958,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -152412,47 +176992,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 64 + LVCA: 64 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152460,13 +177040,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152477,7 +177057,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -152521,28 +177101,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 969 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1116 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -152576,30 +177156,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152613,10 +177193,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152624,13 +177204,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152685,29 +177265,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 970 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1117 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -152740,7 +177320,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -152748,22 +177328,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 32 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152777,10 +177357,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152788,13 +177368,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152849,20 +177429,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 971 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 + SolutionIndex: 1118 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -152870,8 +177450,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -152887,7 +177467,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152895,56 +177475,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152952,8 +177532,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -152969,7 +177549,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -153013,31 +177593,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 972 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1119 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153051,60 +177631,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153112,11 +177696,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -153128,7 +177712,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -153173,31 +177757,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 973 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1120 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153211,64 +177795,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153276,8 +177856,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -153292,8 +177872,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -153337,31 +177917,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 974 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1121 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153375,7 +177955,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -153383,56 +177963,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153440,11 +178020,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -153457,7 +178037,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -153501,31 +178081,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 975 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1122 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153539,7 +178119,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -153547,56 +178127,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153604,12 +178184,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -153665,31 +178245,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 976 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1123 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153703,7 +178283,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -153711,56 +178291,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153768,12 +178348,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -153829,31 +178409,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 977 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1124 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153875,56 +178455,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153932,13 +178512,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153949,7 +178529,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -153993,29 +178573,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 978 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1125 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -154039,7 +178619,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -154047,33 +178627,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 32 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154085,10 +178665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154096,13 +178676,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154157,29 +178737,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 979 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -154195,60 +178775,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154256,13 +178840,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154272,8 +178856,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -154317,31 +178901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 980 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154355,15 +178939,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -154371,44 +178955,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 32 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154417,12 +179005,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154432,7 +179020,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -154477,31 +179065,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 981 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 + SolutionIndex: 1128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154515,45 +179103,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154564,11 +179156,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154576,13 +179168,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154592,8 +179184,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -154637,31 +179229,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 982 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154675,15 +179267,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -154691,44 +179283,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154736,13 +179332,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154752,7 +179348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -154797,31 +179393,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 983 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154835,15 +179431,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -154851,29 +179447,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154884,11 +179484,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154896,13 +179496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154912,8 +179512,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -154957,31 +179557,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 984 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154995,45 +179595,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 32 LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155044,10 +179648,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155056,13 +179660,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155072,7 +179676,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -155117,31 +179721,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 985 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155155,15 +179759,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155171,33 +179775,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -155205,10 +179813,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155216,13 +179824,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155232,8 +179840,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -155277,31 +179885,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 986 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155315,60 +179923,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 4 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155376,12 +179988,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -155392,8 +180004,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -155437,31 +180049,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 987 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155475,15 +180087,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155491,29 +180103,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155524,10 +180140,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155536,13 +180152,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155552,8 +180168,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -155597,31 +180213,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 988 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155635,45 +180251,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155684,10 +180304,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155696,13 +180316,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155712,8 +180332,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -155757,31 +180377,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 989 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155795,45 +180415,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155844,10 +180468,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155856,13 +180480,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155872,8 +180496,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -155917,31 +180541,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 990 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155955,15 +180579,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155971,43 +180595,47 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -156016,13 +180644,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156032,7 +180660,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156077,31 +180705,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 991 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156115,13 +180743,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -156135,29 +180763,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156168,10 +180792,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -156180,12 +180804,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -156196,8 +180820,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -156241,31 +180865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 992 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 32, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156279,13 +180903,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -156299,29 +180923,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156332,10 +180952,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -156344,12 +180964,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -156360,7 +180980,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156405,31 +181025,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 993 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 + SolutionIndex: 1140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156449,54 +181069,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1344 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156504,11 +181128,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -156520,7 +181144,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156565,29 +181189,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 994 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156603,13 +181227,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -156628,39 +181252,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156668,12 +181288,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -156684,7 +181304,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156729,31 +181349,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 995 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156767,13 +181387,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -156787,33 +181407,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -156821,10 +181437,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156832,8 +181448,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -156848,8 +181464,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -156893,31 +181509,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 996 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156931,7 +181547,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -156951,33 +181567,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -156985,10 +181601,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156996,8 +181612,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157057,31 +181673,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 997 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157095,7 +181711,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157115,40 +181731,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157156,8 +181772,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157217,20 +181833,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 998 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -157238,10 +181854,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157255,7 +181871,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157275,44 +181891,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157320,8 +181936,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157381,20 +181997,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 999 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -157402,10 +182018,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157419,7 +182035,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157439,23 +182055,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 2 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2240 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -157468,10 +182084,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 4 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -157480,12 +182096,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -157541,31 +182157,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1000 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 32, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157579,13 +182195,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -157604,28 +182220,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -157633,10 +182245,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157644,8 +182256,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157660,7 +182272,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -157705,31 +182317,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1001 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 + SolutionIndex: 1148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157743,7 +182355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157763,23 +182375,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -157789,18 +182401,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157808,12 +182420,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -157869,31 +182481,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1002 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 + SolutionIndex: 1149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157907,13 +182519,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -157927,40 +182539,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157968,8 +182584,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157984,7 +182600,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -158029,20 +182645,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1003 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158050,10 +182666,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -158087,27 +182703,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 LSPA: 32 LSPB: 16 - LVCA: 4 + LVCA: 8 LVCB: 16 LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -158121,9 +182737,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158132,7 +182748,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -158193,8 +182809,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1004 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -158203,10 +182819,10 @@ SubGroupA: 4 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158231,7 +182847,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -158239,56 +182855,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158296,12 +182912,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158357,31 +182973,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1005 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 + SolutionIndex: 1152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 16, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -158395,7 +183011,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -158403,45 +183019,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -158449,10 +183065,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158460,11 +183076,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158521,31 +183137,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1006 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -158559,7 +183175,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -158567,45 +183183,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -158613,10 +183229,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158624,11 +183240,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158685,31 +183301,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1007 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -158731,56 +183347,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158788,12 +183404,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158849,28 +183465,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1008 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 + SolutionIndex: 1155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -158895,41 +183511,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -158942,9 +183558,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158952,12 +183568,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159013,29 +183629,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1009 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 + SolutionIndex: 1156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -159059,52 +183675,52 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -159117,11 +183733,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159177,29 +183793,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1010 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -159215,7 +183831,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -159235,33 +183851,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159270,9 +183886,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159280,11 +183896,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -159341,31 +183957,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1011 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 + SolutionIndex: 1158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -159379,13 +183995,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -159399,33 +184015,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159433,10 +184045,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159444,8 +184056,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -159460,7 +184072,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -159505,20 +184117,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1012 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -159526,10 +184138,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -159543,7 +184155,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -159563,27 +184175,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -159596,11 +184208,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159608,11 +184220,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -159669,31 +184281,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1013 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 + SolutionIndex: 1160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -159741,13 +184353,13 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -159762,9 +184374,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159772,12 +184384,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159833,20 +184445,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1014 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 + SolutionIndex: 1161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -159854,8 +184466,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -159896,39 +184508,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159940,8 +184552,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159997,28 +184609,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1015 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 + SolutionIndex: 1162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -160055,44 +184667,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160100,12 +184712,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -160161,20 +184773,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1016 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -160182,8 +184794,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -160199,7 +184811,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160207,7 +184819,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160215,33 +184827,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -160252,7 +184864,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -160264,13 +184876,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -160325,31 +184937,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1017 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 + SolutionIndex: 1164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -160363,7 +184975,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160371,7 +184983,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160379,37 +184991,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -160417,10 +185029,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160428,11 +185040,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -160445,7 +185057,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -160489,31 +185101,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1018 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -160527,7 +185139,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160535,7 +185147,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160543,48 +185155,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160592,8 +185204,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160609,7 +185221,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -160653,31 +185265,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1019 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -160691,60 +185303,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160752,8 +185368,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160768,7 +185384,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -160813,31 +185429,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1020 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -160859,56 +185475,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160916,11 +185532,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -160977,28 +185593,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1021 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -161023,7 +185639,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -161031,48 +185647,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161080,13 +185696,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -161141,29 +185757,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1022 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 32 SubGroup1: 4 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -161179,7 +185795,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161187,45 +185803,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161233,10 +185849,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161244,8 +185860,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -161305,15 +185921,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1023 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -161325,11 +185941,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161343,7 +185959,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161351,56 +185967,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161408,11 +186024,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -161469,15 +186085,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1024 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -161489,11 +186105,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161507,7 +186123,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161515,41 +186131,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161557,10 +186173,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161568,8 +186184,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -161585,7 +186201,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -161629,15 +186245,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1025 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -161649,11 +186265,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161667,60 +186283,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161728,12 +186348,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -161744,8 +186364,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -161789,16 +186409,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1026 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -161809,11 +186429,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161827,7 +186447,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161852,39 +186472,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161892,8 +186512,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -161953,15 +186573,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1027 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 @@ -161974,10 +186594,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161991,7 +186611,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162017,27 +186637,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162046,9 +186666,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162056,11 +186676,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -162117,15 +186737,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1028 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -162138,10 +186758,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162180,39 +186800,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162220,12 +186840,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162281,15 +186901,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1029 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -162302,7 +186922,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -162319,13 +186939,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -162336,7 +186956,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -162345,38 +186965,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 32 LSPA: 8 - LSPB: 64 + LSPB: 16 LVCA: 32 - LVCB: 4 + LVCB: 16 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162384,12 +187000,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162400,7 +187016,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -162445,16 +187061,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1030 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -162466,10 +187082,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162483,7 +187099,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162500,7 +187116,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -162509,34 +187125,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 32 LSPA: 8 - LSPB: 64 + LSPB: 16 LVCA: 32 - LVCB: 4 + LVCB: 16 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162544,12 +187160,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162605,15 +187221,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1031 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -162626,10 +187242,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162643,7 +187259,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162660,7 +187276,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -162668,28 +187284,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162697,10 +187313,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162708,12 +187324,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162769,16 +187385,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1032 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -162790,10 +187406,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162807,7 +187423,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162824,7 +187440,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -162832,22 +187448,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -162860,10 +187476,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -162876,9 +187492,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162933,16 +187549,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1033 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -162954,10 +187570,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162971,7 +187587,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162996,35 +187612,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 8 LSPA: 8 LSPB: 32 - LVCA: 32 - LVCB: 8 + LVCA: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163032,12 +187648,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163049,7 +187665,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -163093,31 +187709,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1034 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -163131,13 +187747,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -163156,38 +187772,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 32 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -163202,7 +187814,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163212,7 +187824,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -163257,31 +187869,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1035 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -163323,32 +187935,32 @@ LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 1024 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163362,7 +187974,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163417,29 +188029,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1036 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -163461,7 +188073,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -163483,36 +188095,32 @@ LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163526,7 +188134,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163536,7 +188144,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -163581,29 +188189,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1037 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -163619,7 +188227,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -163644,18 +188252,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 LSPB: 32 - LVCA: 32 - LVCB: 8 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 528 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -163668,11 +188276,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163680,12 +188288,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163697,7 +188305,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -163741,8 +188349,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1038 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -163751,11 +188359,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -163765,7 +188373,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -163804,35 +188412,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163840,12 +188448,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163857,7 +188465,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -163901,29 +188509,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1039 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -163939,13 +188547,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -163964,28 +188572,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 16 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -163993,10 +188597,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164004,13 +188608,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164020,7 +188624,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -164065,31 +188669,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1040 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 + SolutionIndex: 1187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -164103,7 +188707,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -164128,28 +188732,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 + LVCA: 16 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -164157,9 +188761,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164172,9 +188776,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164229,31 +188833,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1041 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -164267,13 +188871,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -164292,28 +188896,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 64 - LVCB: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -164321,9 +188921,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164336,9 +188936,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164348,8 +188948,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -164393,20 +188993,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1042 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 + SolutionIndex: 1189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -164414,10 +189014,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -164437,7 +189037,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -164451,27 +189051,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 64 - LVCA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 LVPA: 16 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -164485,10 +189081,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164496,13 +189092,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164512,7 +189108,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -164557,29 +189153,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1043 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 + SolutionIndex: 1190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -164615,27 +189211,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -164649,10 +189245,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164660,13 +189256,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164721,29 +189317,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1044 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -164759,7 +189355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -164767,56 +189363,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164824,11 +189420,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -164885,31 +189481,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1045 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -164923,7 +189519,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -164931,56 +189527,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164988,8 +189584,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -165049,31 +189645,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1046 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165087,60 +189683,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165148,11 +189748,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -165164,8 +189764,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -165209,31 +189809,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1047 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165247,7 +189847,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -165256,7 +189856,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165273,38 +189873,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165312,11 +189912,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -165373,15 +189973,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1048 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -165394,10 +189994,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165411,7 +190011,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -165420,7 +190020,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165436,28 +190036,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPB: 32 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165465,10 +190065,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165476,13 +190076,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -165537,31 +190137,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1049 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 + SolutionIndex: 1196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165575,7 +190175,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -165584,7 +190184,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165600,18 +190200,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -165621,18 +190221,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165640,8 +190240,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -165701,20 +190301,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1050 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -165722,10 +190322,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165748,7 +190348,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165767,36 +190367,36 @@ LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12416 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165804,12 +190404,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165865,15 +190465,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1051 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -165886,8 +190486,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -165928,38 +190528,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -165968,11 +190568,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 512 PackBatchDims: 0 @@ -166029,14 +190629,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1052 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -166050,8 +190650,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -166059,7 +190659,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166067,14 +190667,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -166091,36 +190691,37 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -166132,13 +190733,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166148,6 +190751,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166155,6 +190759,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166193,37 +190798,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1053 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 + SolutionIndex: 1200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166231,7 +190834,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166240,7 +190843,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -166255,40 +190858,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166296,13 +190900,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166312,6 +190916,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166319,6 +190924,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166357,20 +190963,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1054 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -166378,16 +190984,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166395,7 +191001,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166403,45 +191009,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166449,9 +191056,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166460,13 +191067,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166476,13 +191083,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166521,37 +191130,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1055 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 + SolutionIndex: 1202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166559,7 +191168,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166567,7 +191176,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -166575,47 +191184,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 32 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCA: 48 + LVCB: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166624,13 +191234,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166640,13 +191250,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166685,37 +191297,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1056 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 + SolutionIndex: 1203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166723,7 +191335,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166731,7 +191343,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -166739,37 +191351,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166777,10 +191390,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166788,13 +191401,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166804,6 +191417,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166811,6 +191425,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166849,37 +191464,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1057 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 + SolutionIndex: 1204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166887,7 +191502,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166895,45 +191510,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166941,9 +191557,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166952,13 +191568,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166968,6 +191584,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166975,6 +191592,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167013,37 +191631,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1058 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 + SolutionIndex: 1205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167051,15 +191669,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -167067,37 +191685,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 32 - LSPB: 64 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167105,10 +191724,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167116,13 +191735,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167132,6 +191753,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167139,6 +191761,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167177,37 +191800,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1059 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 + SolutionIndex: 1206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167215,7 +191836,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -167223,7 +191844,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -167235,29 +191856,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 8 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -167268,11 +191890,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167280,12 +191902,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -167296,6 +191918,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167303,6 +191926,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167341,8 +191965,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1060 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -167350,28 +191974,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167379,53 +192003,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167433,10 +192058,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167444,13 +192069,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167460,13 +192087,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167505,37 +192134,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1061 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 + SolutionIndex: 1208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167543,53 +192170,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167597,9 +192225,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167608,13 +192236,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167624,13 +192254,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167669,37 +192301,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1062 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 + SolutionIndex: 1209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167707,53 +192337,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167761,9 +192392,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167772,13 +192403,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167788,6 +192421,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167795,6 +192429,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167833,16 +192468,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1063 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 + SolutionIndex: 1210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -167853,17 +192488,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167871,14 +192504,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -167895,36 +192528,37 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 32 + LSCB: 8 + LSPA: 8 LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -167936,13 +192570,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167952,13 +192588,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167997,20 +192635,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1064 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 + SolutionIndex: 1211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -168018,16 +192656,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168041,9 +192677,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168051,43 +192687,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 8 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168096,8 +192737,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -168112,13 +192755,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168157,37 +192802,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1065 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168201,9 +192844,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168211,43 +192854,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 8 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168256,8 +192904,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -168272,13 +192922,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168317,37 +192969,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1066 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168355,14 +193005,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -168379,19 +193029,20 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -168401,7 +193052,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -168409,9 +193060,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168420,12 +193071,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -168436,6 +193089,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -168443,6 +193097,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168481,16 +193136,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1067 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -168502,16 +193157,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168525,7 +193178,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -168543,35 +193196,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168580,11 +193238,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -168596,13 +193254,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168641,20 +193301,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1068 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -168662,7 +193322,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -168671,7 +193331,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168685,7 +193345,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -168703,35 +193363,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168740,11 +193405,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -168756,13 +193421,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168801,20 +193468,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1069 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -168822,8 +193489,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -168831,7 +193498,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168847,7 +193514,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168855,47 +193522,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168904,8 +193572,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -168920,13 +193588,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168965,29 +193635,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1070 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -168995,7 +193665,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169003,15 +193673,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -169019,33 +193689,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -169053,9 +193728,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169064,12 +193739,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169080,13 +193755,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169125,37 +193802,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1071 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169163,7 +193840,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -169171,7 +193848,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -169179,37 +193856,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -169217,9 +193895,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169228,12 +193906,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169244,6 +193922,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -169251,6 +193930,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169289,37 +193969,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1072 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169327,15 +194007,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -169343,43 +194023,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169388,12 +194073,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169404,13 +194089,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169449,37 +194136,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1073 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169493,53 +194180,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169548,12 +194240,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169564,13 +194256,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169609,28 +194303,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1074 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -169639,7 +194333,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169647,53 +194341,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 16 + LVCA: 128 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -169701,10 +194396,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -169712,12 +194407,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169728,6 +194425,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -169735,6 +194433,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169773,37 +194472,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1075 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169811,14 +194508,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -169835,29 +194532,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -169865,10 +194563,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -169876,11 +194574,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -169892,13 +194592,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169937,37 +194639,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1076 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169975,14 +194675,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -169999,40 +194699,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170040,11 +194741,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -170056,6 +194759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -170063,6 +194767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170101,37 +194806,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1077 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170139,14 +194842,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170163,29 +194866,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170193,10 +194897,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170204,8 +194908,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -170220,13 +194926,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170265,37 +194973,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1078 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170303,14 +195009,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170327,29 +195033,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170357,10 +195064,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170368,8 +195075,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -170384,13 +195093,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170429,37 +195140,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1079 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 + SolutionIndex: 1226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170467,14 +195176,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170491,29 +195200,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170521,10 +195231,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170532,8 +195242,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -170548,6 +195260,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -170555,6 +195268,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170593,37 +195307,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1080 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170631,14 +195343,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170655,40 +195367,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170696,12 +195409,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -170712,6 +195427,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -170719,6 +195435,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170757,37 +195474,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1081 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170795,14 +195510,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170819,29 +195534,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170849,10 +195565,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170860,11 +195576,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -170876,6 +195594,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -170883,6 +195602,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170921,37 +195641,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1082 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170959,7 +195677,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -170983,15 +195701,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -171005,7 +195724,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -171013,10 +195732,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171024,8 +195743,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -171040,13 +195759,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171085,37 +195806,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1083 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171123,7 +195844,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -171131,56 +195852,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171188,8 +195910,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -171204,6 +195926,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -171211,6 +195934,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171249,37 +195973,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1084 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171287,56 +196011,57 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -171350,6 +196075,8 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -171364,13 +196091,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171409,37 +196138,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1085 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171447,14 +196174,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -171471,36 +196198,37 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -171514,10 +196242,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -171528,6 +196258,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -171535,6 +196266,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171573,37 +196305,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1086 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171635,6 +196365,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 @@ -171657,14 +196388,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -171692,6 +196423,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -171699,6 +196431,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171737,28 +196470,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1087 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -171767,7 +196500,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171799,6 +196532,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 @@ -171809,30 +196543,30 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171840,12 +196574,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -171856,6 +196590,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -171863,6 +196598,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171901,28 +196637,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1088 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -171931,7 +196667,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171946,7 +196682,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -171963,6 +196699,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 @@ -171985,14 +196722,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -172006,6 +196743,8 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -172020,6 +196759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172027,6 +196767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172065,20 +196806,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1089 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -172086,16 +196827,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172110,8 +196849,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -172123,29 +196862,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -172157,9 +196897,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -172168,8 +196908,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -172184,6 +196926,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172191,6 +196934,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172229,8 +196973,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1090 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172238,28 +196982,26 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172274,7 +197016,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -172291,6 +197033,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 @@ -172334,6 +197077,8 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -172348,6 +197093,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172355,6 +197101,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172393,8 +197140,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1091 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172402,7 +197149,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -172418,12 +197165,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172431,16 +197176,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -172455,23 +197200,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -172484,7 +197230,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -172498,9 +197244,11 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -172512,6 +197260,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172519,6 +197268,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172557,8 +197307,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1092 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172566,7 +197316,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -172579,15 +197329,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172595,7 +197343,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172603,41 +197351,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -172648,11 +197397,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172660,12 +197409,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -172676,6 +197425,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172683,6 +197433,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172721,8 +197472,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1093 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 1240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172730,28 +197481,28 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172759,7 +197510,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172768,7 +197519,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -172783,23 +197534,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -172812,7 +197564,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -172828,7 +197580,7 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -172840,6 +197592,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172847,6 +197600,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172885,8 +197639,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1094 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172894,7 +197648,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -172907,15 +197661,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172932,7 +197686,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -172947,39 +197701,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -172994,7 +197749,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173004,6 +197759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173011,6 +197767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173049,29 +197806,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1095 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 + SolutionIndex: 1242 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -173079,7 +197836,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173094,8 +197851,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -173107,29 +197864,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 8 LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -173141,9 +197899,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173152,8 +197910,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -173168,6 +197928,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173175,6 +197936,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173213,37 +197975,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1096 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 + SolutionIndex: 1243 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173258,56 +198018,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173316,12 +198077,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -173332,6 +198095,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173339,6 +198103,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173377,37 +198142,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1097 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1244 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173421,39 +198184,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -173465,9 +198233,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173476,12 +198244,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -173492,13 +198262,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173537,37 +198309,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1098 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1245 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173583,41 +198353,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -173629,9 +198400,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173640,12 +198411,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -173656,6 +198427,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173663,6 +198435,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173701,28 +198474,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1099 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1246 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -173731,7 +198504,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173747,7 +198520,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -173759,29 +198532,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -173793,9 +198567,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173804,11 +198578,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -173820,6 +198594,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173827,6 +198602,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173865,28 +198641,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1100 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1247 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -173895,7 +198671,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173910,8 +198686,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -173923,44 +198699,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -173968,12 +198745,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -173984,13 +198763,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174029,16 +198810,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1101 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1248 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -174049,17 +198830,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174074,42 +198853,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -174122,9 +198902,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174132,11 +198912,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -174148,6 +198930,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -174155,6 +198938,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174193,37 +198977,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1102 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1249 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174237,54 +199019,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174292,8 +199079,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 @@ -174308,13 +199097,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174353,37 +199144,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1103 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1250 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174397,54 +199186,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174452,8 +199246,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 @@ -174468,13 +199264,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174513,37 +199311,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1104 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1251 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174558,7 +199354,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -174568,13 +199364,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 @@ -174585,13 +199382,13 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -174606,9 +199403,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174616,12 +199413,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -174632,13 +199431,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174677,20 +199478,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1105 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1252 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -174698,16 +199499,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174715,49 +199514,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -174768,11 +199568,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174781,11 +199581,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -174796,6 +199598,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -174803,6 +199606,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174841,15 +199645,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1106 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1253 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -174861,17 +199665,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174886,53 +199688,54 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 8 - LVCB: 4 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174940,13 +199743,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174956,13 +199762,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175001,37 +199809,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1107 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1254 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175046,36 +199852,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 - LVCB: 4 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -175089,10 +199896,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175100,13 +199907,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175116,13 +199926,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175161,8 +199973,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1108 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1255 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -175171,27 +199983,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175205,37 +200015,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -175249,10 +200064,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175260,13 +200075,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175276,13 +200094,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175321,8 +200141,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1109 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1256 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -175331,27 +200151,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175365,37 +200183,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -175409,10 +200232,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175420,13 +200243,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175436,13 +200262,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175481,8 +200309,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1110 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1257 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -175491,27 +200319,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175525,7 +200351,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -175543,21 +200369,26 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 528 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -175569,10 +200400,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175580,13 +200411,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175596,13 +200428,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175641,8 +200475,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1111 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1258 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -175651,11 +200485,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -175663,7 +200497,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -175671,7 +200505,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175685,54 +200519,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175740,13 +200579,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175756,13 +200596,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175801,28 +200643,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1112 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1259 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -175831,7 +200673,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175845,8 +200687,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -175863,36 +200705,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175900,13 +200747,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175916,13 +200766,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175961,37 +200813,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1113 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1260 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175999,7 +200849,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -176023,29 +200873,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176053,10 +200904,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176064,13 +200915,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176080,6 +200932,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -176087,6 +200940,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176125,37 +200979,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1114 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1261 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176163,14 +201017,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -176187,21 +201041,26 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -176212,10 +201071,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -176224,13 +201083,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176240,13 +201102,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176285,8 +201149,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1115 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1262 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -176295,10 +201159,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -176307,15 +201171,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176323,15 +201185,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176339,33 +201201,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176373,10 +201240,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176384,13 +201251,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176400,13 +201270,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176445,37 +201317,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1116 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1263 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176483,15 +201353,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176499,37 +201369,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176537,10 +201408,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176548,13 +201419,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176564,6 +201438,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -176571,6 +201446,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176609,37 +201485,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1117 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1264 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176647,64 +201521,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176712,13 +201587,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176728,6 +201606,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -176735,6 +201614,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176773,37 +201653,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1118 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1265 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176811,7 +201689,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -176819,7 +201697,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176827,37 +201705,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176865,10 +201744,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176876,13 +201755,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176892,6 +201772,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -176899,6 +201780,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176937,37 +201819,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1119 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 + SolutionIndex: 1266 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176975,7 +201857,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -176983,7 +201865,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176991,37 +201873,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -177029,10 +201912,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177040,13 +201923,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177056,6 +201940,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -177063,6 +201948,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177101,37 +201987,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1120 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 + SolutionIndex: 1267 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177139,53 +202025,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -177193,10 +202076,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177205,12 +202088,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177220,13 +202106,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177265,37 +202153,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1121 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 + SolutionIndex: 1268 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177303,63 +202189,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 - LVCB: 16 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 784 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -177369,12 +202252,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177384,13 +202270,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177429,37 +202317,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1122 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 + SolutionIndex: 1269 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177467,7 +202353,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -177475,8 +202361,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -177487,44 +202373,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177532,13 +202419,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177548,6 +202436,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -177555,6 +202444,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177593,37 +202483,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1123 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 + SolutionIndex: 1270 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177631,64 +202521,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 16 + LSPA: 4 LSPB: 16 - LVCA: 16 + LVCA: 64 LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177696,13 +202583,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177712,13 +202602,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177757,37 +202649,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1124 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 + SolutionIndex: 1271 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177795,53 +202685,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -177849,10 +202736,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177860,13 +202747,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177876,13 +202766,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177921,33 +202813,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1125 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 + SolutionIndex: 1272 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -177959,13 +202849,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -177986,23 +202876,19 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -178013,7 +202899,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -178027,13 +202913,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178044,7 +202931,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -178090,8 +202977,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1126 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1273 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -178114,7 +203001,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -178132,7 +203019,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -178161,15 +203048,11 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -178199,6 +203082,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178209,7 +203093,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -178255,8 +203139,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1127 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1274 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -178277,7 +203161,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -178293,54 +203177,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -178348,10 +203228,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -178359,13 +203239,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178376,8 +203259,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -178422,33 +203305,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1128 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1275 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -178460,15 +203341,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -178476,38 +203357,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -178515,10 +203396,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -178526,13 +203407,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178544,7 +203428,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -178589,33 +203473,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1129 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1276 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -178627,7 +203509,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -178635,7 +203517,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -178643,38 +203525,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -178682,10 +203564,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -178693,13 +203575,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178756,32 +203639,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1130 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1277 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -178794,54 +203677,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -178849,10 +203732,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -178860,13 +203743,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178923,33 +203809,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1131 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1278 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -178961,7 +203845,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -178969,57 +203853,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -179027,15 +203911,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179092,31 +203977,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1132 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1279 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -179128,7 +204013,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -179137,14 +204022,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -179154,28 +204039,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -179183,10 +204068,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -179194,13 +204079,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179257,31 +204143,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1133 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1280 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179295,7 +204181,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -179303,46 +204189,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -179350,10 +204236,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -179361,15 +204247,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179426,35 +204313,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1134 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1281 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -179468,42 +204355,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 4 - LSPB: 64 + LSPB: 32 LVCA: 64 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -179517,9 +204400,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -179528,15 +204411,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179547,7 +204431,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -179593,8 +204477,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1135 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1282 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179603,17 +204487,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -179636,7 +204520,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -179645,7 +204529,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -179655,22 +204539,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -179684,9 +204568,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -179695,15 +204579,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179760,8 +204643,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1136 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1283 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179770,10 +204653,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -179785,10 +204668,12 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -179804,30 +204689,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -179863,14 +204748,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179882,7 +204768,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -179927,8 +204813,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1137 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1284 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179943,11 +204829,11 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -179971,7 +204857,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -179979,11 +204865,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -179991,9 +204877,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 @@ -180030,14 +204916,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180094,8 +204981,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1138 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1285 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180114,7 +205001,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -180122,7 +205009,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180136,42 +205023,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -180185,10 +205068,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180196,8 +205079,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -180205,6 +205088,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180215,8 +205099,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -180261,8 +205145,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1139 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1286 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180271,17 +205155,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -180289,7 +205173,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180297,7 +205181,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -180305,42 +205189,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -180351,11 +205235,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180363,15 +205247,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180383,7 +205268,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -180428,8 +205313,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1140 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1287 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180438,25 +205323,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180470,44 +205355,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 520 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -180519,10 +205400,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180530,13 +205411,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180547,7 +205431,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -180593,8 +205477,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1141 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1288 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180603,27 +205487,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180631,50 +205513,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -180685,11 +205563,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180697,13 +205575,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180714,8 +205595,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -180760,8 +205641,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1142 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1289 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180770,27 +205651,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180805,41 +205684,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -180853,9 +205732,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -180864,13 +205743,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180927,8 +205809,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1143 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1290 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180937,23 +205819,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -180972,58 +205852,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181031,13 +205911,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181049,7 +205932,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -181094,37 +205977,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1144 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1291 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -181132,65 +206013,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181198,13 +206075,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181215,7 +206095,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -181261,37 +206141,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1145 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1292 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -181299,15 +206177,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -181315,49 +206193,45 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181365,13 +206239,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181382,7 +206259,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -181428,33 +206305,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1146 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1293 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -181466,7 +206341,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181475,7 +206350,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -181492,39 +206367,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181532,13 +206407,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181595,20 +206471,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1147 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1294 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -181616,10 +206492,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -181633,7 +206509,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181641,32 +206517,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -181687,27 +206563,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181748,6 +206627,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -181764,8 +206644,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1148 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1295 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -181774,21 +206654,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -181800,7 +206680,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181808,32 +206688,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 64 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -181854,27 +206734,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181886,7 +206769,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -181915,6 +206798,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -181931,8 +206815,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1149 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1296 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -181941,21 +206825,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -181967,7 +206851,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181975,40 +206859,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -182021,20 +206905,22 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -182042,6 +206928,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182082,6 +206969,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182098,8 +206986,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1150 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1297 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182108,21 +206996,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182134,14 +207022,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -182160,14 +207048,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -182188,27 +207076,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182220,7 +207109,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -182249,6 +207138,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182265,8 +207155,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1151 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1298 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182275,11 +207165,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -182289,7 +207179,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182301,14 +207193,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -182327,14 +207219,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -182355,27 +207247,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182387,7 +207280,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -182416,6 +207309,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182432,8 +207326,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1152 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1299 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182442,11 +207336,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -182456,7 +207350,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182475,8 +207371,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -182488,28 +207384,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 96 LVCA: 32 LVCB: 2 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -182523,26 +207419,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182554,7 +207451,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -182583,6 +207480,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182599,8 +207497,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1153 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1300 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182609,21 +207507,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182642,8 +207542,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -182655,28 +207555,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 96 LVCA: 32 LVCB: 2 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -182690,26 +207590,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182750,6 +207651,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182766,8 +207668,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1154 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1301 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182776,21 +207678,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182802,15 +207706,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -182822,28 +207726,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 96 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -182856,27 +207760,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182917,6 +207822,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182933,8 +207839,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1155 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1302 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182943,21 +207849,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182976,9 +207884,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -182998,15 +207906,15 @@ LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -183025,23 +207933,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183082,6 +207995,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183098,8 +208012,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1156 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1303 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183108,11 +208022,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -183120,11 +208034,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -183143,9 +208055,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -183165,15 +208077,15 @@ LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -183192,23 +208104,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183220,7 +208137,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -183249,6 +208166,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183265,8 +208183,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1157 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1304 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183275,11 +208193,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -183290,8 +208208,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -183309,38 +208225,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -183354,19 +208274,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -183374,6 +208296,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183384,8 +208307,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -183414,6 +208337,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183430,8 +208354,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1158 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 + SolutionIndex: 1305 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183440,17 +208364,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -183466,7 +208390,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -183474,7 +208398,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -183482,65 +208406,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183581,6 +208508,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183597,31 +208525,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1159 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1306 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -183633,54 +208561,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -183688,24 +208616,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183746,6 +208679,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183762,33 +208696,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1160 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1307 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -183800,7 +208732,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -183826,53 +208758,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183913,6 +208848,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183929,31 +208865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1161 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1308 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -183967,15 +208903,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -183983,65 +208919,66 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184053,7 +208990,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -184082,6 +209019,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184098,31 +209036,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1162 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 + SolutionIndex: 1309 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184134,15 +209074,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -184154,34 +209094,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -184189,26 +209129,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184220,7 +209161,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -184249,6 +209190,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184265,31 +209207,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1163 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1310 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184301,14 +209245,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -184327,28 +209271,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -184356,26 +209300,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184416,6 +209361,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184432,20 +209378,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1164 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1311 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -184453,10 +209399,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184468,16 +209416,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -184494,55 +209442,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184583,6 +209532,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184599,20 +209549,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1165 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1312 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -184620,10 +209570,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184635,79 +209587,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184748,6 +209705,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184764,15 +209722,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1166 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1313 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -184784,13 +209742,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184802,14 +209758,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -184829,27 +209785,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -184858,23 +209814,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184915,6 +209876,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184931,15 +209893,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1167 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1314 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -184952,12 +209914,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184978,7 +209938,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -184998,15 +209958,15 @@ LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -185016,32 +209976,35 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185082,6 +210045,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -185098,15 +210062,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1168 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1315 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -185119,7 +210083,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -185136,7 +210100,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -185145,45 +210109,45 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 8 + LVCA: 64 LVCB: 8 - LVPA: 32 - LVPB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -185191,26 +210155,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185251,12 +210218,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185267,31 +210236,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1169 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1316 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185303,7 +210272,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -185323,34 +210292,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3104 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -185358,26 +210327,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185418,12 +210390,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185434,31 +210408,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1170 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1317 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185470,13 +210444,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -185490,61 +210464,60 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185555,7 +210528,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -185585,12 +210558,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185601,31 +210576,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1171 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 + SolutionIndex: 1318 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185637,79 +210612,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185750,12 +210730,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185766,33 +210748,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1172 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1319 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185804,15 +210784,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -185824,59 +210804,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185917,12 +210902,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185933,33 +210920,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1173 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1320 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185971,15 +210956,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -185991,61 +210976,60 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186056,8 +211040,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -186086,12 +211070,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186102,14 +211088,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1174 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1321 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -186122,11 +211108,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186138,7 +211124,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -186146,73 +211132,76 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186253,12 +211242,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186269,31 +211260,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1175 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1322 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186305,54 +211296,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -186360,26 +211351,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186391,7 +211383,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -186420,12 +211412,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186436,31 +211430,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1176 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1323 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186472,81 +211468,82 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186587,12 +211584,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186603,31 +211602,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1177 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1324 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186639,81 +211640,82 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186725,7 +211727,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -186754,12 +211756,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186770,31 +211774,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1178 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1325 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186806,7 +211812,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -186814,7 +211820,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -186822,65 +211828,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186892,7 +211901,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -186921,12 +211930,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186937,31 +211948,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1179 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1326 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186979,7 +211990,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -186993,7 +212004,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -187008,11 +212019,15 @@ LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -187025,24 +212040,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -187055,8 +212072,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -187085,12 +212102,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187101,8 +212120,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1180 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1327 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187111,19 +212130,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -187137,7 +212156,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187164,19 +212183,19 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 4 - LSPB: 32 + LSPB: 16 LVCA: 64 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -187187,26 +212206,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -187220,7 +212241,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -187249,12 +212270,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187265,8 +212288,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1181 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1328 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187275,11 +212298,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -187287,9 +212310,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187301,7 +212324,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187309,7 +212332,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -187321,28 +212344,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -187355,20 +212378,22 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -187417,12 +212442,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187433,8 +212460,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1182 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1329 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187443,21 +212470,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187469,7 +212496,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187477,36 +212504,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -187523,25 +212550,27 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -187585,12 +212614,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187601,8 +212632,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1183 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1330 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187611,21 +212642,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187637,14 +212668,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -187663,22 +212694,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -187691,24 +212718,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -187721,7 +212752,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -187751,12 +212782,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187767,8 +212800,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1184 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1331 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187777,10 +212810,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -187791,9 +212824,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187805,16 +212836,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -187831,18 +212862,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -187859,20 +212890,24 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -187890,7 +212925,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -187919,12 +212954,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187935,8 +212972,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1185 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1332 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187945,10 +212982,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -187959,9 +212996,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187973,7 +213008,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187981,7 +213016,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -187989,32 +213024,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -188027,26 +213062,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -188089,12 +213126,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188105,8 +213144,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1186 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1333 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188115,21 +213154,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -188148,8 +213187,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -188157,32 +213196,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -188196,22 +213235,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -188255,12 +213298,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188271,8 +213316,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1187 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1334 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188281,23 +213326,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -188317,7 +213360,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -188325,11 +213368,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -188337,22 +213380,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -188371,17 +213414,19 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -188396,7 +213441,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -188425,12 +213470,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188441,8 +213488,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1188 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1335 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188461,9 +213508,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -188512,15 +213559,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -188539,7 +213586,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -188593,12 +213642,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188609,8 +213660,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1189 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1336 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188645,7 +213696,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -188654,7 +213705,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -188672,12 +213723,184 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1337 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 @@ -188707,17 +213930,17 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -188761,12 +213984,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188777,8 +214002,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1190 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1338 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188797,11 +214022,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -188813,48 +214040,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -188867,7 +214094,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -188875,14 +214102,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -188929,12 +214156,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188945,8 +214174,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1191 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1339 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188965,11 +214194,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -189043,7 +214274,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -189095,12 +214328,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189111,8 +214346,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1192 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1340 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189211,7 +214446,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -189263,12 +214500,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189279,8 +214518,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1193 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1341 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189323,40 +214562,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -189368,21 +214611,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -189399,8 +214642,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -189429,12 +214672,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189445,8 +214690,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1194 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 1342 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189455,21 +214700,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -189487,7 +214734,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -189501,26 +214748,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 2 LSPB: 32 - LVCA: 64 + LVCA: 128 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 784 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -189532,25 +214783,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -189563,8 +214816,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -189593,12 +214846,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189609,8 +214864,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1195 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1343 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189619,19 +214874,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -189645,23 +214900,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -189671,24 +214926,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -189699,24 +214954,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -189759,12 +215018,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189775,8 +215036,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1196 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1344 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189785,11 +215046,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -189799,9 +215060,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -189813,46 +215072,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 64 LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -189863,26 +215126,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -189895,8 +215158,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -189925,12 +215188,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189941,8 +215206,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1197 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1345 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189951,13 +215216,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -189965,7 +215230,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -189977,46 +215244,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -190027,26 +215298,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -190059,7 +215330,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -190089,12 +215360,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190105,8 +215378,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1198 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1346 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190115,21 +215388,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190141,16 +215416,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190168,17 +215443,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4224 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -190191,27 +215470,29 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190223,8 +215504,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -190253,12 +215534,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190269,15 +215552,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1199 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1347 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -190290,10 +215573,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190305,16 +215588,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190332,17 +215615,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -190355,15 +215642,17 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -190373,7 +215662,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190385,8 +215674,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -190415,12 +215704,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190431,15 +215722,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1200 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1348 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -190452,10 +215743,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -190469,50 +215760,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 800 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -190520,26 +215811,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190581,12 +215872,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190597,8 +215890,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1201 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 1349 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190607,21 +215900,23 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190633,7 +215928,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -190641,46 +215936,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1680 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 192 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -190689,25 +215984,27 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190749,12 +216046,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190765,8 +216064,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1202 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 1350 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190775,21 +216074,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190801,16 +216100,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190828,27 +216127,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 2 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -190857,23 +216156,27 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190886,7 +216189,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -190915,12 +216218,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190931,8 +216236,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1203 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 1351 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190941,23 +216246,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190969,54 +216272,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -191024,26 +216327,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -191056,7 +216359,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -191085,12 +216388,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191101,31 +216406,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1204 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1352 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191137,13 +216444,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -191157,61 +216464,59 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -191223,7 +216528,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191253,12 +216558,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191269,31 +216576,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1205 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 1353 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191305,79 +216612,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 1 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -191389,7 +216696,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191419,12 +216726,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191435,33 +216744,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1206 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1354 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191473,81 +216780,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 1 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -191559,7 +216864,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191589,12 +216894,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191605,35 +216912,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1207 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1355 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -191641,7 +216948,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -191649,36 +216956,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 32 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -191691,25 +216998,27 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -191753,12 +217062,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191769,8 +217080,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1208 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1356 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191779,21 +217090,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191805,14 +217116,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -191821,7 +217132,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -191831,22 +217142,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -191859,24 +217170,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -191919,12 +217234,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191935,8 +217252,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1209 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1357 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191945,10 +217262,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -191957,15 +217274,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -191980,41 +217295,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -192028,25 +217343,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192089,12 +217404,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192105,8 +217422,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1210 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1358 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192115,21 +217432,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -192148,7 +217467,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -192167,22 +217486,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 4 + LSPA: 5 LSPB: 64 - LVCA: 64 + LVCA: 48 LVCB: 4 - LVPA: 2 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -192196,21 +217515,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -192257,12 +217576,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192273,8 +217594,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1211 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1359 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192283,10 +217604,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -192295,13 +217616,15 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192315,38 +217638,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -192360,25 +217687,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192391,8 +217718,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192421,12 +217748,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192437,8 +217766,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1212 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1360 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192447,25 +217776,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 32 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192480,43 +217811,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 96 LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 1544 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192528,21 +217859,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -192560,7 +217891,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192589,12 +217920,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192605,8 +217938,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1213 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1361 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192615,25 +217948,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192641,50 +217976,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 520 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -192692,19 +218031,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -192723,8 +218064,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192753,12 +218094,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192769,35 +218112,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1214 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1362 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192811,70 +218154,74 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192887,8 +218234,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192917,12 +218264,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192933,35 +218282,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1215 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1363 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192969,7 +218320,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -192995,28 +218346,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 3392 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -193024,25 +218375,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193056,7 +218409,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -193085,12 +218438,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193101,31 +218456,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1216 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1364 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193137,7 +218492,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -193146,71 +218501,73 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3392 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193253,12 +218610,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193269,35 +218628,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1217 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 1365 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -193311,18 +218670,18 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -193331,50 +218690,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193387,7 +218752,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -193417,12 +218782,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193433,35 +218800,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1218 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 + SolutionIndex: 1366 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -193469,23 +218836,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -193495,20 +218862,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -193519,20 +218890,22 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -193551,7 +218924,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -193581,12 +218954,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193597,8 +218972,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1219 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 + SolutionIndex: 1367 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -193607,21 +218982,21 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193640,43 +219015,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -193688,23 +219063,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193747,12 +219126,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193763,33 +219144,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1220 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1368 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193801,65 +219180,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193869,14 +219248,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193926,6 +219303,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193936,31 +219314,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1221 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1369 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193972,15 +219352,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -193992,45 +219372,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194040,10 +219420,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -194097,6 +219475,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194107,31 +219486,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1222 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1370 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194143,7 +219524,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -194151,56 +219532,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -194211,13 +219592,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -194268,6 +219649,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194278,14 +219660,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1223 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1371 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -194294,15 +219676,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194314,65 +219696,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194382,11 +219764,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -194437,6 +219821,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194447,33 +219832,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1224 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1372 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194485,65 +219868,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194553,11 +219936,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -194608,6 +219993,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194618,33 +220004,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1225 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1373 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194656,7 +220040,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -194665,45 +220049,45 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -194711,10 +220095,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194724,11 +220108,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -194743,7 +220127,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -194779,6 +220163,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194789,31 +220174,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1226 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1374 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -194827,23 +220212,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -194853,28 +220238,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -194882,10 +220267,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194895,11 +220280,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -194950,6 +220337,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194960,33 +220348,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1227 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1375 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194998,7 +220384,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195007,14 +220393,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -195024,28 +220410,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -195053,10 +220439,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195066,11 +220452,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -195121,6 +220507,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195131,31 +220518,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1228 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1376 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -195169,7 +220556,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195177,57 +220564,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195237,13 +220624,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -195258,7 +220645,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -195294,6 +220681,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195304,31 +220692,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1229 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1377 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195340,7 +220728,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195349,7 +220737,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -195366,39 +220754,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195408,8 +220796,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -195429,7 +220817,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -195465,6 +220853,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195475,20 +220864,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1230 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1378 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -195496,10 +220885,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195511,7 +220900,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195537,28 +220926,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -195566,10 +220955,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195579,13 +220968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -195636,6 +221025,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195646,20 +221036,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1231 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1379 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -195667,10 +221057,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195682,14 +221072,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -195708,28 +221098,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -195737,10 +221127,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195750,13 +221140,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -195807,6 +221195,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195817,20 +221206,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1232 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1380 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -195838,10 +221227,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195853,16 +221244,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -195879,39 +221270,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195921,10 +221312,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -195978,6 +221367,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195988,20 +221378,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1233 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1381 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -196009,10 +221399,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196024,7 +221416,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -196050,28 +221442,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -196079,10 +221471,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196092,8 +221484,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -196147,30 +221539,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1234 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1382 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -196178,10 +221573,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -196195,7 +221590,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -196203,7 +221598,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -196211,38 +221606,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -196250,10 +221645,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196263,11 +221658,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -196282,7 +221677,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -196318,41 +221713,44 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1235 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1383 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -196366,7 +221764,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -196392,28 +221790,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -196421,10 +221819,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196434,11 +221832,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -196453,7 +221851,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -196489,30 +221887,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1236 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1384 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -196520,10 +221921,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -196544,7 +221945,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -196607,6 +222008,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -196624,7 +222027,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -196660,18 +222063,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1237 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1385 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196692,11 +222098,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196715,7 +222119,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -196778,6 +222182,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -196831,18 +222237,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1238 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1386 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196863,11 +222272,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196888,7 +222295,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -196905,39 +222312,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196947,13 +222354,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -197004,30 +222411,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1239 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1387 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -197035,7 +222445,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -197050,14 +222460,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -197076,28 +222486,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -197105,10 +222515,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -197118,13 +222528,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -197139,7 +222547,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -197175,30 +222583,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1240 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1388 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -197206,10 +222617,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -197221,7 +222634,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -197247,28 +222660,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -197276,10 +222689,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -197289,11 +222702,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -197344,30 +222757,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1241 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1389 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -197375,10 +222791,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - [2, 3, 0, 1] @@ -199782,36 +225198,18 @@ - [101, 4291.65] - - [3136, 64, 128, 64] - [183, 8175.06] - - - [784, 512, 64, 128] - - [181, 8378.34] - - - [3136, 256, 64, 64] - - [184, 8506.65] - - - [12544, 1024, 1, 256] - - [177, 8927.93] - - [784, 128, 128, 512] - [182, 8190.53] - - [784, 512, 256, 128] - [180, 8637.14] - - - [3136, 64, 64, 256] - - [179, 8782.93] - - - [3136, 512, 1, 2048] - - [176, 7298.32] - - - [12544, 256, 1, 1024] - - [188, 7667.25] - - - [3136, 2048, 1, 512] - - [187, 8447.22] - - [3136, 256, 256, 64] - [180, 8663.08] - - [3136, 64, 128, 256] - [178, 8943.46] - - - [784, 128, 64, 512] - - [186, 8006.27] - - [3136, 64, 256, 64] - [183, 8267.12] - - [784, 512, 128, 128] - [180, 8564.25] - - - [3136, 64, 64, 64] - - [183, 8009.35] - - [784, 128, 256, 512] - [184, 8377.06] - - [3136, 64, 256, 256] @@ -199896,8 +225294,6 @@ - [213, 4032.98] - - [1024, 256, 1, 4096] - [201, 7326.3] - - - [4096, 512, 1, 4096] - - [193, 9471.97] - - [1024, 200, 1, 2048] - [194, 5530.46] - - [2048, 1024, 1, 512] @@ -203614,6236 +229010,6688 @@ - [608, 9352.16] - - [256, 8976, 1, 44505] - [612, 8430.23] + - - [6272, 256, 1, 528] + - [664, 7389.94] + - - [3136, 2048, 1, 1024] + - [645, 9657.94] + - - [6272, 112, 1, 512] + - [643, 5931.09] + - - [2048, 320, 1, 1280] + - [663, 7772.99] + - - [289, 256, 1, 1568] + - [684, 3718.17] + - - [3136, 64, 64, 64] + - [623, 8201.15] + - - [50176, 128, 1, 256] + - [646, 8908.58] + - - [5329, 64, 1, 448] + - [629, 4602.2] + - - [289, 192, 1, 1344] + - [681, 3452.59] + - - [12544, 1024, 1, 256] + - [646, 9742.64] + - - [784, 64, 32, 192] + - [622, 6844.61] + - - [6272, 64, 1, 480] + - [630, 5562.24] + - - [196, 128, 1, 800] + - [672, 1639.74] + - - [64, 512, 1, 1344] + - [671, 2313.04] + - - [6272, 64, 1, 512] + - [629, 5609.19] + - - [6272, 160, 1, 528] + - [630, 6149.7] + - - [289, 160, 32, 768] + - [657, 6637.82] + - - [12544, 256, 1, 1024] + - [664, 8790.46] + - - [289, 224, 1, 1568] + - [684, 3270.17] + - - [5329, 64, 32, 160] + - [637, 9091.04] + - - [5329, 96, 1, 576] + - [664, 5555.66] + - - [3025, 64, 1, 363] + - [682, 4392.3] + - - [784, 32, 32, 192] + - [653, 5633.8] + - - [3136, 512, 1, 1024] + - [649, 7553.14] + - - [6272, 16, 1, 480] + - [684, 3219.85] + - - [1225, 64, 32, 288] + - [644, 8240.58] + - - [64, 256, 1, 1536] + - [677, 1456.36] + - - [289, 192, 32, 768] + - [656, 7372.8] + - - [2048, 448, 1, 1280] + - [639, 8403.01] + - - [3136, 2048, 1, 512] + - [638, 9486.31] + - - [289, 256, 1, 2016] + - [684, 3876.08] + - - [289, 384, 32, 1024] + - [623, 7350.54] + - - [1568, 32, 1, 832] + - [673, 2717.87] + - - [3136, 64, 32, 64] + - [626, 7657.26] + - - [289, 160, 1, 1120] + - [680, 2826.9] + - - [6272, 128, 1, 528] + - [634, 6926.26] + - - [21609, 32, 1, 288] + - [635, 3698.9] + - - [1225, 192, 1, 1728] + - [668, 7309.81] + - - [4096, 512, 1, 4096] + - [651, 10272.1] + - - [64, 256, 1, 1152] + - [677, 1387.82] + - - [6272, 96, 1, 480] + - [665, 6371.56] + - - [784, 96, 1, 800] + - [685, 3330.27] + - - [2048, 448, 1, 2048] + - [639, 8622.65] + - - [784, 96, 32, 192] + - [654, 7092.36] + - - [3136, 64, 64, 256] + - [647, 9579.16] + - - [289, 224, 1, 1344] + - [684, 3180.01] + - - [1001, 512, 1, 4096] + - [625, 8195.07] + - - [2048, 192, 1, 1280] + - [630, 6120.09] + - - [1225, 64, 32, 256] + - [635, 8076.62] + - - [2048, 256, 1, 1536] + - [625, 8137.7] + - - [1225, 64, 1, 1200] + - [684, 3552.87] + - - [6272, 128, 1, 512] + - [638, 6878.21] + - - [729, 192, 1, 1600] + - [683, 5016.77] + - - [289, 192, 1, 896] + - [681, 3091.87] + - - [1568, 384, 1, 832] + - [664, 6934.62] + - - [784, 16, 32, 192] + - [655, 3380.28] + - - [1568, 256, 1, 832] + - [629, 5980.86] + - - [1568, 48, 1, 832] + - [686, 3275.09] + - - [1568, 192, 1, 832] + - [624, 4441.11] + - - [289, 192, 32, 1024] + - [627, 6563.06] + - - [6272, 32, 1, 528] + - [668, 4998.67] + - - [49, 128, 1, 1200] + - [669, 550.175] + - - [1225, 64, 32, 384] + - [641, 8589.33] + - - [289, 128, 1, 896] + - [680, 2103.1] + - - [1568, 160, 1, 832] + - [668, 6995.05] + - - [1001, 32, 1, 1024] + - [677, 1744.72] + - - [2048, 320, 1, 2048] + - [662, 7118.04] + - - [2048, 384, 1, 1536] + - [625, 8184.01] + - - [50176, 512, 1, 256] + - [637, 9852.4] + - - [289, 256, 1, 1792] + - [686, 3809.75] + - - [64, 448, 1, 1152] + - [678, 2128.23] + - - [5041, 96, 1, 576] + - [663, 5279.3] + - - [6272, 192, 1, 480] + - [625, 7479.65] + - - [784, 32, 32, 256] + - [652, 5708.91] + - - [1001, 32, 1, 2048] + - [679, 2141.04] + - - [289, 192, 1, 1120] + - [675, 3277.77] + - - [6272, 32, 1, 512] + - [667, 4978.7] + - - [289, 384, 1, 3456] + - [684, 5904.14] + - - [289, 384, 1, 2592] + - [685, 5707.34] + - - [784, 128, 64, 512] + - [631, 8864.39] + - - [12544, 1024, 1, 512] + - [646, 10008.3] + - - [12544, 256, 1, 512] + - [664, 8628.08] + - - [6272, 24, 1, 512] + - [668, 3568.07] + - - [5041, 192, 1, 720] + - [639, 8424.42] + - - [64, 320, 1, 1728] + - [672, 1469.66] + - - [784, 128, 32, 256] + - [640, 8104.14] + - - [289, 96, 1, 864] + - [678, 1838.25] + - - [1225, 32, 32, 192] + - [659, 5949.72] + - - [1568, 128, 1, 832] + - [667, 5718.69] + - - [289, 128, 32, 768] + - [625, 7289.25] + - - [3136, 256, 64, 64] + - [633, 9103.92] + - - [196, 64, 1, 800] + - [671, 915.62] + - - [4096, 512, 1, 9216] + - [648, 10351.4] + - - [12544, 64, 1, 147] + - [638, 5069.33] + - - [784, 32, 1, 400] + - [669, 1140.36] + - - [6272, 160, 1, 512] + - [629, 6140.08] + - - [1225, 48, 32, 288] + - [635, 5978.61] + - - [64, 320, 1, 2880] + - [676, 1920.0] + - - [1225, 64, 32, 192] + - [629, 7641.01] + - - [1001, 32, 1, 1536] + - [677, 2084.79] + - - [784, 64, 32, 256] + - [621, 6990.51] + - - [64, 384, 1, 1152] + - [678, 1862.6] + - - [784, 512, 64, 128] + - [632, 9025.95] + - - [3136, 512, 1, 2048] + - [650, 7764.3] + - - [6272, 144, 1, 512] + - [625, 5574.04] + - - [1225, 192, 32, 384] + - [639, 9373.83] + - - [64, 192, 1, 1728] + - [677, 1206.46] + - - [8192, 320, 1, 1280] + - [691, 9875.92] + - - [8192, 320, 1, 2048] + - [694, 9745.7] + - - [8192, 384, 1, 1280] + - [691, 10046.2] + - - [8192, 192, 1, 1280] + - [694, 9950.9] + - - [8192, 192, 1, 2048] + - [690, 9559.67] + - - [8192, 384, 1, 2048] + - [692, 9945.74] + - - [8192, 448, 1, 2048] + - [693, 9908.51] + - - [1001, 64, 1, 1536] + - [687, 3649.94] + - - [8192, 448, 1, 1280] + - [691, 9981.35] + - - [1001, 64, 1, 2048] + - [688, 3580.87] + - - [1001, 128, 1, 2048] + - [689, 5587.87] - - [704, 1024, 1, 128] - - [723, 3019.56] + - [797, 3019.56] - - [1024, 1024, 1, 3328] - - [761, 8162.65] + - [835, 8162.65] - - [4, 704, 1, 1280] - - [664, 319.646] + - [738, 319.646] - - [4, 1856, 1, 3328] - - [694, 550.614] + - [768, 550.614] - - [1856, 448, 1, 3328] - - [746, 6813.15] + - [820, 6813.15] - - [2944, 4288, 1, 1280] - - [755, 8975.86] + - [829, 8975.86] - - [2368, 64, 1, 3328] - - [669, 5482.33] + - [743, 5482.33] - - [1760, 32, 1, 1760] - - [708, 3860.04] + - [782, 3860.04] - - [2368, 5888, 1, 256] - - [752, 8656.83] + - [826, 8656.83] - - [5888, 1856, 1, 256] - - [742, 7881.53] + - [816, 7881.53] - - [64, 3584, 1, 1280] - - [678, 4835.43] + - [752, 4835.43] - - [512, 24000, 1, 1536] - - [749, 8666.0] + - [823, 8666.0] - - [128, 6784, 1, 3328] - - [746, 7062.35] + - [820, 7062.35] - - [5888, 1408, 1, 256] - - [759, 8130.32] + - [833, 8130.32] - - [5888, 1856, 1, 3328] - - [749, 8840.85] + - [823, 8840.85] - - [512, 4, 1, 512] - - [634, 170.323] + - [708, 170.323] - - [35, 1500, 1, 2560] - - [638, 2896.65] + - [712, 2896.65] - - [1856, 4288, 1, 256] - - [738, 8374.73] + - [812, 8374.73] - - [1024, 5056, 1, 128] - - [735, 3304.35] + - [809, 3304.35] - - [5056, 5056, 1, 3328] - - [749, 8905.53] + - [823, 8905.53] - - [1408, 5888, 1, 1280] - - [749, 9418.2] + - [823, 9418.2] - - [2368, 448, 1, 128] - - [723, 3075.07] + - [797, 3075.07] - - [6144, 6000, 1, 2560] - - [749, 9336.43] + - [823, 9336.43] - - [2368, 6784, 1, 128] - - [722, 4919.36] + - [796, 4919.36] - - [1024, 3584, 1, 3328] - - [740, 8071.17] + - [814, 8071.17] - - [512, 48000, 1, 2048] - - [749, 8763.16] + - [823, 8763.16] - - [1408, 64, 1, 128] - - [645, 805.57] + - [719, 805.57] - - [256, 4288, 1, 3328] - - [771, 6331.96] + - [845, 6331.96] - - [5888, 1408, 1, 1280] - - [739, 9226.27] + - [813, 9226.27] - - [704, 1856, 1, 3328] - - [765, 6309.5] + - [839, 6309.5] - - [1408, 4288, 1, 256] - - [749, 8374.6] + - [823, 8374.6] - - [1024, 2368, 1, 256] - - [746, 7341.12] + - [820, 7341.12] - - [64, 4, 1, 256] - - [689, 13.1032] + - [763, 13.1032] - - [1408, 1856, 1, 1280] - - [756, 8773.05] + - [830, 8773.05] - - [1408, 64, 1, 1280] - - [702, 4050.08] + - [776, 4050.08] - - [448, 1024, 1, 1280] - - [765, 6071.26] + - [839, 6071.26] - - [4096, 32, 1, 4096] - - [699, 5491.82] + - [773, 5491.82] - - [256, 1408, 1, 3328] - - [751, 5351.49] + - [825, 5351.49] - - [5056, 5056, 1, 1280] - - [759, 9408.67] + - [833, 9408.67] - - [448, 5056, 1, 256] - - [764, 6680.54] + - [838, 6680.54] - - [704, 1856, 1, 1280] - - [741, 7504.03] + - [815, 7504.03] - - [128, 5056, 1, 128] - - [656, 2316.58] + - [730, 2316.58] - - [2368, 128, 1, 256] - - [741, 3660.22] + - [815, 3660.22] - - [1856, 1408, 1, 128] - - [728, 3885.97] + - [802, 3885.97] - - [64, 5056, 1, 256] - - [751, 3318.91] + - [825, 3318.91] - - [6784, 256, 1, 3328] - - [749, 7590.64] + - [823, 7590.64] - - [1408, 3584, 1, 256] - - [738, 8276.92] + - [812, 8276.92] - - [4288, 448, 1, 256] - - [751, 7139.79] + - [825, 7139.79] - - [64, 704, 1, 128] - - [652, 375.567] + - [726, 375.567] - - [1024, 1856, 1, 128] - - [721, 2890.66] + - [795, 2890.66] - - [4288, 2944, 1, 1280] - - [755, 8981.45] + - [829, 8981.45] - - [704, 5056, 1, 1280] - - [741, 7684.72] + - [815, 7684.72] - - [2368, 704, 1, 3328] - - [756, 7070.14] + - [830, 7070.14] - - [256, 5888, 1, 256] - - [741, 7319.45] + - [815, 7319.45] - - [1856, 4288, 1, 3328] - - [739, 9238.69] + - [813, 9238.69] - - [256, 2944, 1, 256] - - [741, 6090.31] + - [815, 6090.31] - - [5888, 1024, 1, 256] - - [745, 8270.05] + - [819, 8270.05] - - [448, 64, 1, 1280] - - [698, 2493.32] + - [772, 2493.32] - - [3072, 64, 1, 1024] - - [681, 3149.77] + - [755, 3149.77] - - [3584, 4, 1, 1280] - - [783, 567.862] + - [857, 567.862] - - [2560, 16, 1, 2560] - - [690, 2887.15] + - [764, 2887.15] - - [2944, 64, 1, 256] - - [681, 2565.76] + - [755, 2565.76] - - [128, 4, 1, 1280] - - [784, 78.8692] + - [858, 78.8692] - - [1408, 2944, 1, 256] - - [745, 8337.3] + - [819, 8337.3] - - [256, 1856, 1, 1280] - - [771, 6267.35] + - [845, 6267.35] - - [6784, 5056, 1, 3328] - - [755, 9424.0] + - [829, 9424.0] - - [5056, 5056, 1, 256] - - [742, 8758.33] + - [816, 8758.33] - - [128, 256, 1, 256] - - [697, 1205.36] + - [771, 1205.36] - - [64, 1024, 1, 1280] - - [708, 3566.68] + - [782, 3566.68] - - [2944, 4, 1, 256] - - [661, 319.449] + - [735, 319.449] - - [704, 5056, 1, 128] - - [730, 4073.83] + - [804, 4073.83] - - [4, 2368, 1, 1280] - - [689, 496.992] + - [763, 496.992] - - [2368, 2944, 1, 1280] - - [738, 9085.55] + - [812, 9085.55] - - [448, 448, 1, 3328] - - [716, 5428.76] + - [790, 5428.76] - - [6784, 6784, 1, 1280] - - [755, 8727.03] + - [829, 8727.03] - - [1024, 256, 1, 3328] - - [765, 5499.42] + - [839, 5499.42] - - [1408, 4288, 1, 1280] - - [739, 9094.42] + - [813, 9094.42] - - [3584, 4288, 1, 1280] - - [742, 8703.88] + - [816, 8703.88] - - [512, 6000, 1, 2560] - - [745, 8474.56] + - [819, 8474.56] - - [2368, 704, 1, 1280] - - [751, 7651.59] + - [825, 7651.59] - - [5056, 4288, 1, 3328] - - [759, 8545.35] + - [833, 8545.35] - - [3584, 2368, 1, 3328] - - [747, 8797.88] + - [821, 8797.88] - - [5888, 6784, 1, 1280] - - [745, 8785.18] + - [819, 8785.18] - - [64, 704, 1, 1280] - - [668, 2783.48] + - [742, 2783.48] - - [4288, 256, 1, 256] - - [741, 6162.78] + - [815, 6162.78] - - [2944, 128, 1, 128] - - [643, 1951.33] + - [717, 1951.33] - - [6144, 32, 1, 2560] - - [702, 4589.05] + - [776, 4589.05] - - [6784, 448, 1, 1280] - - [746, 8674.31] + - [820, 8674.31] - - [2944, 5888, 1, 256] - - [759, 8991.76] + - [833, 8991.76] - - [64, 64, 1, 1280] - - [719, 712.448] + - [793, 712.448] - - [4288, 2944, 1, 256] - - [755, 8678.14] + - [829, 8678.14] - - [5888, 704, 1, 1280] - - [745, 8652.71] + - [819, 8652.71] - - [5056, 4, 1, 3328] - - [661, 650.772] + - [735, 650.772] - - [1856, 64, 1, 1280] - - [678, 4471.97] + - [752, 4471.97] - - [1760, 16, 1, 1760] - - [718, 2592.23] + - [792, 2592.23] - - [448, 5888, 1, 128] - - [728, 3823.03] + - [802, 3823.03] - - [5888, 64, 1, 3328] - - [710, 6013.22] + - [784, 6013.22] - - [2944, 256, 1, 3328] - - [751, 7791.45] + - [825, 7791.45] - - [1024, 64, 1, 128] - - [652, 592.516] + - [726, 592.516] - - [5056, 2368, 1, 1280] - - [738, 9260.53] + - [812, 9260.53] - - [448, 3584, 1, 1280] - - [759, 6771.34] + - [833, 6771.34] - - [6784, 5888, 1, 256] - - [753, 7933.39] + - [827, 7933.39] - - [64, 1024, 1, 3328] - - [702, 4783.08] + - [776, 4783.08] - - [704, 128, 1, 1280] - - [708, 3971.98] + - [782, 3971.98] - - [4, 3584, 1, 128] - - [777, 59.5238] + - [851, 59.5238] - - [1408, 448, 1, 1280] - - [751, 5902.17] + - [825, 5902.17] - - [1024, 1408, 1, 256] - - [746, 5272.94] + - [820, 5272.94] - - [2368, 2368, 1, 3328] - - [751, 8488.76] + - [825, 8488.76] - - [1856, 6784, 1, 128] - - [728, 4742.51] + - [802, 4742.51] - - [5056, 704, 1, 3328] - - [754, 7772.48] + - [828, 7772.48] - - [1408, 1856, 1, 256] - - [772, 5229.84] + - [846, 5229.84] - - [1408, 704, 1, 3328] - - [772, 6954.93] + - [846, 6954.93] - - [2368, 5056, 1, 256] - - [745, 8580.68] + - [819, 8580.68] - - [1408, 256, 1, 1280] - - [771, 4790.11] + - [845, 4790.11] - - [3072, 128, 1, 1024] - - [767, 4579.87] + - [841, 4579.87] - - [3584, 2368, 1, 1280] - - [738, 8675.13] + - [812, 8675.13] - - [4288, 64, 1, 3328] - - [717, 5550.11] + - [791, 5550.11] - - [2368, 4, 1, 1280] - - [783, 537.518] + - [857, 537.518] - - [704, 5888, 1, 256] - - [739, 5305.88] + - [813, 5305.88] - - [6784, 2944, 1, 128] - - [735, 4344.21] + - [809, 4344.21] - - [6784, 64, 1, 256] - - [765, 4496.42] + - [839, 4496.42] - - [2944, 256, 1, 256] - - [751, 6553.7] + - [825, 6553.7] - - [2944, 6784, 1, 3328] - - [739, 8895.76] + - [813, 8895.76] - - [128, 1, 1, 1408] - - [719, 25.7] + - [793, 25.7] - - [704, 1408, 1, 3328] - - [753, 7913.21] + - [827, 7913.21] - - [3584, 704, 1, 3328] - - [738, 7526.43] + - [812, 7526.43] - - [2944, 256, 1, 128] - - [722, 2830.76] + - [796, 2830.76] - - [6784, 4, 1, 1280] - - [779, 645.235] + - [853, 645.235] - - [1024, 64, 1, 1280] - - [677, 3013.25] + - [751, 3013.25] - - [8448, 4, 1, 2816] - - [629, 984.768] + - [703, 984.768] - - [448, 4288, 1, 256] - - [751, 7139.79] + - [825, 7139.79] - - [64, 3584, 1, 3328] - - [675, 5683.27] + - [749, 5683.27] - - [704, 2368, 1, 1280] - - [759, 7045.3] + - [833, 7045.3] - - [1856, 2368, 1, 1280] - - [756, 8327.9] + - [830, 8327.9] - - [2368, 128, 1, 3328] - - [692, 6082.65] + - [766, 6082.65] - - [64, 193600, 1, 64] - - [741, 6747.77] + - [815, 6747.77] - - [1760, 128, 1, 1760] - - [669, 5513.07] + - [743, 5513.07] - - [448, 1408, 1, 256] - - [751, 5591.54] + - [825, 5591.54] - - [1856, 4288, 1, 1280] - - [749, 8647.72] + - [823, 8647.72] - - [64, 5056, 1, 3328] - - [709, 6096.59] + - [783, 6096.59] - - [512, 1500, 1, 2816] - - [751, 7879.3] + - [825, 7879.3] - - [1024, 448, 1, 128] - - [723, 1844.33] + - [797, 1844.33] - - [704, 4, 1, 1280] - - [689, 341.433] + - [763, 341.433] - - [704, 256, 1, 128] - - [723, 1001.34] + - [797, 1001.34] - - [256, 193600, 1, 64] - - [759, 8113.3] + - [833, 8113.3] - - [704, 2944, 1, 128] - - [730, 3747.13] + - [804, 3747.13] - - [1408, 1024, 1, 1280] - - [756, 7080.71] + - [830, 7080.71] - - [704, 6784, 1, 256] - - [774, 6630.47] + - [848, 6630.47] - - [6784, 704, 1, 256] - - [741, 8005.86] + - [815, 8005.86] - - [5056, 1408, 1, 128] - - [732, 4303.13] + - [806, 4303.13] - - [2048, 7000, 1, 2048] - - [749, 9269.2] + - [823, 9269.2] - - [256, 3584, 1, 3328] - - [743, 7334.48] + - [817, 7334.48] - - [5056, 704, 1, 256] - - [751, 7954.12] + - [825, 7954.12] - - [128, 1408, 1, 128] - - [646, 1243.02] + - [720, 1243.02] - - [3584, 4288, 1, 3328] - - [775, 7683.81] + - [849, 7683.81] - - [5888, 1856, 1, 1280] - - [739, 8831.34] + - [813, 8831.34] - - [256, 1408, 1, 256] - - [741, 4352.68] + - [815, 4352.68] - - [5056, 64, 1, 1280] - - [708, 5012.05] + - [782, 5012.05] - - [1024, 704, 1, 256] - - [741, 5710.17] + - [815, 5710.17] - - [64, 256, 1, 128] - - [647, 149.897] + - [721, 149.897] - - [2368, 3584, 1, 1280] - - [749, 8609.68] + - [823, 8609.68] - - [1024, 256, 1, 256] - - [765, 3276.9] + - [839, 3276.9] - - [1856, 4, 1, 1280] - - [663, 497.104] + - [737, 497.104] - - [448, 448, 1, 256] - - [751, 3117.83] + - [825, 3117.83] - - [2944, 3584, 1, 3328] - - [739, 8879.45] + - [813, 8879.45] - - [7680, 32, 1, 2560] - - [709, 5310.24] + - [783, 5310.24] - - [128, 4288, 1, 128] - - [649, 2116.2] + - [723, 2116.2] - - [256, 256, 1, 3328] - - [702, 4774.7] + - [776, 4774.7] - - [128, 1024, 1, 3328] - - [703, 5894.8] + - [777, 5894.8] - - [4, 1408, 1, 3328] - - [694, 552.674] + - [768, 552.674] - - [196, 256, 64, 1024] - - [792, 5218.34] + - [866, 5218.34] - - [6784, 2944, 1, 256] - - [757, 8271.18] + - [831, 8271.18] - - [64, 1856, 1, 1280] - - [708, 4167.96] + - [782, 4167.96] - - [64, 1024, 1, 128] - - [642, 589.188] + - [716, 589.188] - - [1024, 1500, 1, 2560] - - [746, 8407.88] + - [820, 8407.88] - - [1856, 2368, 1, 256] - - [741, 8092.15] + - [815, 8092.15] - - [3584, 256, 1, 128] - - [724, 2607.57] + - [798, 2607.57] - - [3584, 6784, 1, 3328] - - [758, 8558.83] + - [832, 8558.83] - - [256, 1024, 1, 256] - - [751, 3901.78] + - [825, 3901.78] - - [4, 6784, 1, 3328] - - [689, 662.575] + - [763, 662.575] - - [1024, 5888, 1, 3328] - - [749, 9161.76] + - [823, 9161.76] - - [1024, 128, 1, 1280] - - [706, 3942.12] + - [780, 3942.12] - - [3072, 32, 1, 1024] - - [683, 2840.49] + - [757, 2840.49] - - [6144, 24000, 1, 2560] - - [739, 7605.87] + - [813, 7605.87] - - [448, 1024, 1, 256] - - [741, 5062.19] + - [815, 5062.19] - - [5056, 4288, 1, 1280] - - [749, 9090.99] + - [823, 9090.99] - - [5888, 64, 1, 256] - - [751, 4449.78] + - [825, 4449.78] - - [1856, 256, 1, 1280] - - [765, 5834.46] + - [839, 5834.46] - - [64, 5888, 1, 3328] - - [703, 6152.44] + - [777, 6152.44] - - [2368, 2368, 1, 1280] - - [743, 8594.66] + - [817, 8594.66] - - [2944, 5888, 1, 128] - - [728, 4776.19] + - [802, 4776.19] - - [704, 5888, 1, 1280] - - [743, 8435.91] + - [817, 8435.91] - - [2368, 3584, 1, 128] - - [725, 4590.71] + - [799, 4590.71] - - [1856, 5056, 1, 128] - - [736, 4503.48] + - [810, 4503.48] - - [4608, 1, 1, 1536] - - [634, 226.955] + - [708, 226.955] - - [448, 256, 1, 3328] - - [678, 5415.56] + - [752, 5415.56] - - [2944, 6784, 1, 1280] - - [762, 8385.11] + - [836, 8385.11] - - [448, 1856, 1, 128] - - [732, 2618.96] + - [806, 2618.96] - - [128, 1024, 1, 128] - - [641, 940.527] + - [715, 940.527] - - [7680, 4, 1, 2560] - - [665, 985.104] + - [739, 985.104] - - [1024, 704, 1, 1280] - - [751, 7204.56] + - [825, 7204.56] - - [128, 5888, 1, 256] - - [741, 6313.52] + - [815, 6313.52] - - [1024, 5056, 1, 1280] - - [746, 8979.76] + - [820, 8979.76] - - [4288, 1024, 1, 256] - - [738, 7198.29] + - [812, 7198.29] - - [2944, 2368, 1, 128] - - [723, 4624.57] + - [797, 4624.57] - - [704, 704, 1, 3328] - - [764, 5870.71] + - [838, 5870.71] - - [704, 1408, 1, 1280] - - [753, 7680.32] + - [827, 7680.32] - - [5888, 448, 1, 1280] - - [741, 7718.66] + - [815, 7718.66] - - [3584, 256, 1, 3328] - - [746, 7523.88] + - [820, 7523.88] - - [704, 5888, 1, 3328] - - [751, 8196.99] + - [825, 8196.99] - - [704, 1856, 1, 128] - - [729, 3388.43] + - [803, 3388.43] - - [128, 3584, 1, 3328] - - [703, 6626.5] + - [777, 6626.5] - - [4, 4288, 1, 128] - - [776, 159.648] + - [850, 159.648] - - [128, 704, 1, 1280] - - [666, 4038.73] + - [740, 4038.73] - - [3584, 2944, 1, 256] - - [739, 7685.99] + - [813, 7685.99] - - [1856, 128, 1, 3328] - - [695, 6070.63] + - [769, 6070.63] - - [1856, 2368, 1, 3328] - - [756, 8460.62] + - [830, 8460.62] - - [512, 6000, 1, 2816] - - [759, 9019.55] + - [833, 9019.55] - - [2944, 448, 1, 128] - - [722, 3027.73] + - [796, 3027.73] - - [64, 193600, 1, 256] - - [765, 7080.32] + - [839, 7080.32] - - [128, 2944, 1, 1280] - - [741, 5397.87] + - [815, 5397.87] - - [448, 2944, 1, 1280] - - [751, 6996.97] + - [825, 6996.97] - - [512, 24000, 1, 2048] - - [759, 8832.67] + - [833, 8832.67] - - [128, 256, 1, 3328] - - [698, 3531.57] + - [772, 3531.57] - - [1408, 5056, 1, 3328] - - [754, 7969.94] + - [828, 7969.94] - - [1856, 1856, 1, 3328] - - [741, 8140.34] + - [815, 8140.34] - - [3584, 128, 1, 256] - - [751, 4861.05] + - [825, 4861.05] - - [448, 1408, 1, 3328] - - [741, 6353.75] + - [815, 6353.75] - - [2368, 2368, 1, 256] - - [755, 8369.37] + - [829, 8369.37] - - [4288, 4288, 1, 1280] - - [745, 8666.52] + - [819, 8666.52] - - [64, 448, 1, 1280] - - [698, 2591.92] + - [772, 2591.92] - - [5888, 1024, 1, 1280] - - [738, 8526.6] + - [812, 8526.6] - - [704, 1024, 1, 256] - - [751, 4971.8] + - [825, 4971.8] - - [1024, 12544, 1, 256] - - [789, 8611.9] + - [863, 8611.9] - - [448, 4, 1, 256] - - [694, 78.6534] + - [768, 78.6534] - - [5888, 448, 1, 128] - - [725, 3592.03] + - [799, 3592.03] - - [512, 48000, 1, 2560] - - [759, 9237.44] + - [833, 9237.44] - - [8448, 16, 1, 2816] - - [624, 3360.21] + - [698, 3360.21] - - [704, 6784, 1, 3328] - - [760, 7774.95] + - [834, 7774.95] - - [5888, 5888, 1, 1280] - - [746, 9238.25] + - [820, 9238.25] - - [5056, 1024, 1, 1280] - - [774, 8227.88] + - [848, 8227.88] - - [448, 5888, 1, 3328] - - [749, 7777.63] + - [823, 7777.63] - - [3072, 2, 1, 1024] - - [686, 376.383] + - [760, 376.383] - - [1024, 2944, 1, 1280] - - [739, 8650.45] + - [813, 8650.45] - - [5056, 5888, 1, 1280] - - [749, 8861.6] + - [823, 8861.6] - - [4288, 5888, 1, 128] - - [729, 5049.01] + - [803, 5049.01] - - [256, 3584, 1, 256] - - [741, 6314.11] + - [815, 6314.11] - - [256, 4, 1, 1280] - - [785, 163.94] + - [859, 163.94] - - [1408, 3584, 1, 128] - - [729, 4290.22] + - [803, 4290.22] - - [256, 2944, 1, 3328] - - [751, 7620.99] + - [825, 7620.99] - - [448, 3584, 1, 128] - - [729, 3353.9] + - [803, 3353.9] - - [5888, 2944, 1, 1280] - - [739, 9498.31] + - [813, 9498.31] - - [4, 6784, 1, 1280] - - [689, 623.916] + - [763, 623.916] - - [2368, 5888, 1, 128] - - [728, 4840.29] + - [802, 4840.29] - - [35, 8457, 1, 1760] - - [635, 4059.88] + - [709, 4059.88] - - [64, 2944, 1, 128] - - [646, 1310.82] + - [720, 1310.82] - - [2368, 4, 1, 256] - - [780, 369.739] + - [854, 369.739] - - [3584, 5888, 1, 256] - - [757, 7996.33] + - [831, 7996.33] - - [2368, 1024, 1, 128] - - [723, 3915.07] + - [797, 3915.07] - - [2368, 704, 1, 128] - - [723, 3658.97] + - [797, 3658.97] - - [512, 32, 1, 512] - - [712, 1127.6] + - [786, 1127.6] - - [3584, 2368, 1, 128] - - [723, 4462.48] + - [797, 4462.48] - - [5056, 704, 1, 128] - - [722, 4062.21] + - [796, 4062.21] - - [448, 2368, 1, 128] - - [723, 2829.07] + - [797, 2829.07] - - [4, 5056, 1, 256] - - [671, 425.868] + - [745, 425.868] - - [5056, 1408, 1, 3328] - - [756, 8848.92] + - [830, 8848.92] - - [1408, 704, 1, 256] - - [751, 5394.56] + - [825, 5394.56] - - [6784, 1024, 1, 3328] - - [738, 9232.02] + - [812, 9232.02] - - [6784, 2944, 1, 3328] - - [749, 8714.84] + - [823, 8714.84] - - [7680, 1, 1, 2560] - - [685, 248.845] + - [759, 248.845] - - [1856, 1856, 1, 256] - - [750, 7586.58] + - [824, 7586.58] - - [64, 64, 1, 3328] - - [720, 1363.25] + - [794, 1363.25] - - [512, 1, 1, 512] - - [634, 43.2158] + - [708, 43.2158] - - [6784, 2368, 1, 1280] - - [751, 8665.74] + - [825, 8665.74] - - [4608, 2, 1, 1536] - - [634, 452.65] + - [708, 452.65] - - [4288, 3584, 1, 256] - - [759, 8936.7] + - [833, 8936.7] - - [4288, 5888, 1, 1280] - - [756, 8957.15] + - [830, 8957.15] - - [4608, 4, 1, 1536] - - [627, 846.737] + - [701, 846.737] - - [1024, 6000, 1, 1536] - - [749, 8398.54] + - [823, 8398.54] - - [8448, 32, 1, 2816] - - [709, 5343.07] + - [783, 5343.07] - - [448, 2944, 1, 3328] - - [756, 7247.04] + - [830, 7247.04] - - [4288, 1856, 1, 1280] - - [739, 8902.86] + - [813, 8902.86] - - [1856, 2944, 1, 3328] - - [751, 8622.86] + - [825, 8622.86] - - [256, 6784, 1, 3328] - - [751, 8050.77] + - [825, 8050.77] - - [512, 3000, 1, 1536] - - [772, 7108.12] + - [846, 7108.12] - - [64, 5888, 1, 256] - - [764, 3567.74] + - [838, 3567.74] - - [256, 5056, 1, 128] - - [731, 3041.12] + - [805, 3041.12] - - [5056, 1024, 1, 256] - - [755, 8401.47] + - [829, 8401.47] - - [704, 64, 1, 3328] - - [714, 4299.02] + - [788, 4299.02] - - [5056, 1856, 1, 3328] - - [759, 8660.77] + - [833, 8660.77] - - [4, 2944, 1, 3328] - - [689, 618.637] + - [763, 618.637] - - [512, 1500, 1, 2048] - - [771, 5481.22] + - [845, 5481.22] - - [1024, 1, 1, 500000] - - [625, 260.061] + - [699, 260.061] - - [256, 4, 1, 256] - - [689, 50.5123] + - [763, 50.5123] - - [6784, 128, 1, 3328] - - [743, 6950.91] + - [817, 6950.91] - - [4288, 1408, 1, 128] - - [723, 4539.58] + - [797, 4539.58] - - [1856, 5888, 1, 3328] - - [749, 8712.93] + - [823, 8712.93] - - [4288, 5056, 1, 256] - - [755, 8997.15] + - [829, 8997.15] - - [1408, 128, 1, 1280] - - [678, 4599.12] + - [752, 4599.12] - - [4096, 7000, 1, 4096] - - [745, 8555.89] + - [819, 8555.89] - - [5056, 256, 1, 3328] - - [751, 8257.16] + - [825, 8257.16] - - [704, 704, 1, 256] - - [741, 5852.39] + - [815, 5852.39] - - [1024, 3000, 1, 2560] - - [738, 8258.84] + - [812, 8258.84] - - [1024, 5888, 1, 1280] - - [738, 8988.99] + - [812, 8988.99] - - [6784, 2368, 1, 128] - - [724, 4562.25] + - [798, 4562.25] - - [4, 5056, 1, 1280] - - [689, 600.441] + - [763, 600.441] - - [256, 64, 1, 1280] - - [712, 1899.69] + - [786, 1899.69] - - [128, 1856, 1, 1280] - - [751, 5185.76] + - [825, 5185.76] - - [1856, 1024, 1, 1280] - - [756, 7875.95] + - [830, 7875.95] - - [6784, 4288, 1, 1280] - - [759, 8981.18] + - [833, 8981.18] - - [1856, 1856, 1, 1280] - - [740, 7794.71] + - [814, 7794.71] - - [35, 1500, 1, 2048] - - [640, 2192.6] + - [714, 2192.6] - - [3072, 24000, 1, 1024] - - [752, 8690.58] + - [826, 8690.58] - - [1408, 5056, 1, 1280] - - [751, 8427.87] + - [825, 8427.87] - - [4, 2368, 1, 3328] - - [694, 594.422] + - [768, 594.422] - - [5888, 1856, 1, 128] - - [723, 4294.05] + - [797, 4294.05] - - [448, 704, 1, 1280] - - [746, 4136.39] + - [820, 4136.39] - - [448, 6784, 1, 128] - - [724, 3976.2] + - [798, 3976.2] - - [1024, 448, 1, 3328] - - [756, 6376.33] + - [830, 6376.33] - - [2944, 128, 1, 256] - - [741, 4466.26] + - [815, 4466.26] - - [5056, 3584, 1, 128] - - [729, 4997.18] + - [803, 4997.18] - - [5888, 5888, 1, 3328] - - [759, 8870.37] + - [833, 8870.37] - - [6784, 1024, 1, 256] - - [738, 8520.53] + - [812, 8520.53] - - [2944, 2368, 1, 256] - - [775, 6174.59] + - [849, 6174.59] - - [256, 448, 1, 256] - - [751, 1844.33] + - [825, 1844.33] - - [5056, 5888, 1, 3328] - - [740, 8076.65] + - [814, 8076.65] - - [1856, 1024, 1, 256] - - [751, 7188.92] + - [825, 7188.92] - - [512, 48000, 1, 1536] - - [762, 7282.2] + - [836, 7282.2] - - [3584, 448, 1, 1280] - - [741, 6869.1] + - [815, 6869.1] - - [1024, 1024, 1, 1280] - - [751, 8027.45] + - [825, 8027.45] - - [448, 5888, 1, 256] - - [741, 5765.84] + - [815, 5765.84] - - [2048, 128, 1, 2048] - - [699, 4835.01] + - [773, 4835.01] - - [1408, 6784, 1, 3328] - - [751, 8613.76] + - [825, 8613.76] - - [448, 1024, 1, 128] - - [722, 2315.57] + - [796, 2315.57] - - [4288, 704, 1, 128] - - [723, 4138.92] + - [797, 4138.92] - - [128, 1856, 1, 128] - - [658, 1397.56] + - [732, 1397.56] - - [448, 2368, 1, 3328] - - [741, 6786.48] + - [815, 6786.48] - - [5056, 64, 1, 128] - - [723, 1664.84] + - [797, 1664.84] - - [5056, 2944, 1, 256] - - [774, 7697.49] + - [848, 7697.49] - - [6784, 5888, 1, 128] - - [723, 5003.67] + - [797, 5003.67] - - [1024, 700, 1, 512] - - [751, 6036.31] + - [825, 6036.31] - - [3072, 1, 1, 128] - - [705, 70.3171] + - [779, 70.3171] - - [1024, 4, 1, 256] - - [663, 154.302] + - [737, 154.302] - - [2944, 704, 1, 128] - - [729, 3697.0] + - [803, 3697.0] - - [128, 6784, 1, 1280] - - [741, 6731.51] + - [815, 6731.51] - - [1408, 3584, 1, 3328] - - [739, 9258.07] + - [813, 9258.07] - - [2368, 6784, 1, 256] - - [738, 8840.4] + - [812, 8840.4] - - [5056, 1408, 1, 1280] - - [739, 9240.84] + - [813, 9240.84] - - [5056, 4288, 1, 128] - - [734, 4309.18] + - [808, 4309.18] - - [4, 704, 1, 256] - - [689, 130.697] + - [763, 130.697] - - [4288, 2368, 1, 3328] - - [752, 8755.33] + - [826, 8755.33] - - [1408, 1856, 1, 128] - - [722, 3918.75] + - [796, 3918.75] - - [1408, 5888, 1, 3328] - - [759, 8910.47] + - [833, 8910.47] - - [1856, 256, 1, 256] - - [741, 5631.34] + - [815, 5631.34] - - [6784, 6784, 1, 256] - - [749, 9298.76] + - [823, 9298.76] - - [5888, 5056, 1, 128] - - [724, 4811.36] + - [798, 4811.36] - - [4288, 2368, 1, 128] - - [723, 4749.1] + - [797, 4749.1] - - [128, 5888, 1, 1280] - - [750, 6393.86] + - [824, 6393.86] - - [256, 4288, 1, 1280] - - [741, 6887.79] + - [815, 6887.79] - - [2368, 2944, 1, 256] - - [755, 8314.82] + - [829, 8314.82] - - [4, 1856, 1, 256] - - [778, 267.03] + - [852, 267.03] - - [3584, 1856, 1, 1280] - - [739, 8631.91] + - [813, 8631.91] - - [6784, 6784, 1, 128] - - [729, 5059.96] + - [803, 5059.96] - - [256, 1856, 1, 128] - - [722, 1858.82] + - [796, 1858.82] - - [49, 512, 64, 2048] - - [793, 3053.67] + - [867, 3053.67] - - [704, 64, 1, 1280] - - [672, 2849.49] + - [746, 2849.49] - - [5888, 5056, 1, 256] - - [758, 8202.52] + - [832, 8202.52] - - [8448, 48000, 1, 2816] - - [749, 4281.94] + - [823, 4281.94] - - [512, 6000, 1, 2048] - - [741, 8047.89] + - [815, 8047.89] - - [3584, 448, 1, 256] - - [751, 6805.43] + - [825, 6805.43] - - [448, 4288, 1, 128] - - [729, 3500.83] + - [803, 3500.83] - - [7680, 64, 1, 2560] - - [684, 5957.9] + - [758, 5957.9] - - [256, 6784, 1, 256] - - [751, 7331.83] + - [825, 7331.83] - - [1408, 4288, 1, 128] - - [723, 4501.49] + - [797, 4501.49] - - [2944, 704, 1, 3328] - - [751, 8439.7] + - [825, 8439.7] - - [128, 448, 1, 256] - - [672, 1555.19] + - [746, 1555.19] - - [2048, 32, 1, 2048] - - [683, 3226.49] + - [757, 3226.49] - - [3584, 3584, 1, 256] - - [755, 8784.9] + - [829, 8784.9] - - [448, 1408, 1, 128] - - [722, 2535.92] + - [796, 2535.92] - - [128, 256, 1, 1280] - - [698, 2896.72] + - [772, 2896.72] - - [3584, 5056, 1, 256] - - [742, 8566.52] + - [816, 8566.52] - - [6784, 128, 1, 256] - - [741, 6053.97] + - [815, 6053.97] - - [4288, 4, 1, 256] - - [661, 428.9] + - [735, 428.9] - - [64, 1408, 1, 3328] - - [666, 5025.11] + - [740, 5025.11] - - [704, 448, 1, 256] - - [765, 3409.74] + - [839, 3409.74] - - [2944, 2368, 1, 1280] - - [739, 9066.35] + - [813, 9066.35] - - [448, 64, 1, 3328] - - [714, 3528.96] + - [788, 3528.96] - - [704, 6784, 1, 128] - - [728, 4212.61] + - [802, 4212.61] - - [3584, 4, 1, 3328] - - [781, 658.353] + - [855, 658.353] - - [6784, 3584, 1, 256] - - [749, 9061.84] + - [823, 9061.84] - - [704, 448, 1, 128] - - [728, 1552.8] + - [802, 1552.8] - - [256, 128, 1, 128] - - [653, 281.975] + - [727, 281.975] - - [704, 1408, 1, 128] - - [728, 3026.76] + - [802, 3026.76] - - [4, 448, 1, 128] - - [777, 5.56127] + - [851, 5.56127] - - [4288, 128, 1, 1280] - - [708, 5471.64] + - [782, 5471.64] - - [128, 1408, 1, 256] - - [751, 2813.35] + - [825, 2813.35] - - [4, 2944, 1, 256] - - [671, 316.766] + - [745, 316.766] - - [64, 128, 1, 3328] - - [719, 1872.56] + - [793, 1872.56] - - [1856, 1408, 1, 256] - - [741, 7735.89] + - [815, 7735.89] - - [5056, 2368, 1, 128] - - [723, 4830.19] + - [797, 4830.19] - - [2944, 2944, 1, 3328] - - [759, 8890.11] + - [833, 8890.11] - - [5056, 6784, 1, 256] - - [749, 9015.25] + - [823, 9015.25] - - [1856, 3584, 1, 128] - - [730, 4455.12] + - [804, 4455.12] - - [5888, 4, 1, 1280] - - [779, 642.063] + - [853, 642.063] - - [128, 2944, 1, 128] - - [648, 2037.03] + - [722, 2037.03] - - [35, 8457, 1, 2560] - - [636, 3988.23] + - [710, 3988.23] - - [3584, 6784, 1, 128] - - [723, 4774.54] + - [797, 4774.54] - - [128, 4288, 1, 256] - - [741, 4851.85] + - [815, 4851.85] - - [704, 448, 1, 3328] - - [756, 4432.63] + - [830, 4432.63] - - [2368, 6784, 1, 1280] - - [739, 9161.48] + - [813, 9161.48] - - [128, 128, 1, 3328] - - [713, 2839.99] + - [787, 2839.99] - - [5056, 1856, 1, 256] - - [755, 8380.94] + - [829, 8380.94] - - [256, 128, 1, 256] - - [697, 1165.18] + - [771, 1165.18] - - [1024, 3000, 1, 2816] - - [756, 8714.27] + - [830, 8714.27] - - [1024, 1856, 1, 256] - - [746, 7014.79] + - [820, 7014.79] - - [64, 1, 1, 1216] - - [719, 11.8205] + - [793, 11.8205] - - [4288, 64, 1, 128] - - [650, 1669.65] + - [724, 1669.65] - - [256, 448, 1, 3328] - - [674, 5152.39] + - [748, 5152.39] - - [1408, 6784, 1, 1280] - - [759, 8735.22] + - [833, 8735.22] - - [3584, 3584, 1, 1280] - - [756, 9020.09] + - [830, 9020.09] - - [7680, 24000, 1, 2560] - - [759, 6940.24] + - [833, 6940.24] - - [64, 2368, 1, 1280] - - [669, 4433.07] + - [743, 4433.07] - - [448, 2368, 1, 1280] - - [744, 5352.92] + - [818, 5352.92] - - [4608, 48000, 1, 1536] - - [738, 8129.11] + - [812, 8129.11] - - [5888, 5888, 1, 128] - - [731, 4700.91] + - [805, 4700.91] - - [64, 6784, 1, 3328] - - [741, 6170.82] + - [815, 6170.82] - - [2944, 256, 1, 1280] - - [771, 6177.65] + - [845, 6177.65] - - [2048, 16, 1, 2048] - - [693, 2167.7] + - [767, 2167.7] - - [256, 2368, 1, 128] - - [722, 2037.77] + - [796, 2037.77] - - [5056, 2368, 1, 3328] - - [739, 9040.6] + - [813, 9040.6] - - [2944, 4288, 1, 256] - - [770, 7552.22] + - [844, 7552.22] - - [1408, 3584, 1, 1280] - - [746, 8808.76] + - [820, 8808.76] - - [2368, 64, 1, 256] - - [682, 2320.51] + - [756, 2320.51] - - [1024, 128, 1, 128] - - [642, 1075.56] + - [716, 1075.56] - - [704, 128, 1, 3328] - - [675, 4985.02] + - [749, 4985.02] - - [5888, 4, 1, 128] - - [776, 33.6558] + - [850, 33.6558] - - [1856, 704, 1, 256] - - [751, 7110.98] + - [825, 7110.98] - - [1024, 1500, 1, 2816] - - [746, 8499.88] + - [820, 8499.88] - - [8448, 1, 1, 2816] - - [629, 251.469] + - [703, 251.469] - - [1024, 4, 1, 3328] - - [785, 541.032] + - [859, 541.032] - - [1024, 6000, 1, 2048] - - [746, 8698.59] + - [820, 8698.59] - - [512, 24000, 1, 2560] - - [739, 8963.7] + - [813, 8963.7] - - [6144, 3000, 1, 2560] - - [762, 8761.97] + - [836, 8761.97] - - [2368, 6784, 1, 3328] - - [756, 8867.49] + - [830, 8867.49] - - [1856, 1408, 1, 1280] - - [743, 7908.53] + - [817, 7908.53] - - [1856, 448, 1, 1280] - - [756, 6544.01] + - [830, 6544.01] - - [6784, 704, 1, 128] - - [722, 4086.45] + - [796, 4086.45] - - [4, 4, 1, 256] - - [689, 0.852941] + - [763, 0.852941] - - [128, 5888, 1, 128] - - [646, 2582.25] + - [720, 2582.25] - - [5056, 2944, 1, 128] - - [726, 4579.17] + - [800, 4579.17] - - [1408, 5888, 1, 256] - - [738, 8810.77] + - [812, 8810.77] - - [704, 2944, 1, 1280] - - [753, 8420.9] + - [827, 8420.9] - - [4288, 64, 1, 1280] - - [678, 4906.15] + - [752, 4906.15] - - [256, 64, 1, 256] - - [680, 689.953] + - [754, 689.953] - - [1024, 1024, 1, 256] - - [756, 5528.01] + - [830, 5528.01] - - [704, 1856, 1, 256] - - [740, 4452.92] + - [814, 4452.92] - - [2560, 64, 1, 2560] - - [669, 4563.09] + - [743, 4563.09] - - [3584, 704, 1, 1280] - - [746, 7898.77] + - [820, 7898.77] - - [256, 128, 1, 1280] - - [698, 2865.06] + - [772, 2865.06] - - [5888, 2368, 1, 256] - - [745, 8628.37] + - [819, 8628.37] - - [256, 2368, 1, 1280] - - [741, 6073.57] + - [815, 6073.57] - - [2944, 6784, 1, 128] - - [722, 4756.77] + - [796, 4756.77] - - [3584, 448, 1, 3328] - - [741, 7265.07] + - [815, 7265.07] - - [1408, 4, 1, 256] - - [782, 234.157] + - [856, 234.157] - - [704, 2368, 1, 3328] - - [739, 7248.98] + - [813, 7248.98] - - [2944, 448, 1, 256] - - [746, 6365.89] + - [820, 6365.89] - - [1856, 448, 1, 128] - - [724, 2976.34] + - [798, 2976.34] - - [4608, 6000, 1, 1536] - - [759, 9469.42] + - [833, 9469.42] - - [2368, 128, 1, 1280] - - [708, 4773.39] + - [782, 4773.39] - - [256, 5888, 1, 128] - - [723, 3112.0] + - [797, 3112.0] - - [64, 6784, 1, 256] - - [741, 3755.14] + - [815, 3755.14] - - [64, 5056, 1, 1280] - - [702, 4935.6] + - [776, 4935.6] - - [4, 6784, 1, 128] - - [777, 111.142] + - [851, 111.142] - - [3025, 64, 64, 64] - - [791, 6643.75] + - [865, 6643.75] - - [2944, 2944, 1, 1280] - - [739, 8869.55] + - [813, 8869.55] - - [5056, 448, 1, 3328] - - [772, 6706.2] + - [846, 6706.2] - - [4, 3584, 1, 1280] - - [689, 573.54] + - [763, 573.54] - - [1408, 128, 1, 128] - - [641, 1293.19] + - [715, 1293.19] - - [6784, 704, 1, 3328] - - [756, 8368.33] + - [830, 8368.33] - - [128, 64, 1, 1280] - - [715, 1260.41] + - [789, 1260.41] - - [2368, 256, 1, 1280] - - [741, 6154.47] + - [815, 6154.47] - - [4, 448, 1, 3328] - - [694, 351.738] + - [768, 351.738] - - [5888, 4288, 1, 128] - - [723, 4340.99] + - [797, 4340.99] - - [4, 5888, 1, 256] - - [671, 428.318] + - [745, 428.318] - - [1408, 2944, 1, 3328] - - [738, 9400.85] + - [812, 9400.85] - - [3584, 704, 1, 128] - - [725, 3392.55] + - [799, 3392.55] - - [64, 1024, 1, 256] - - [672, 1762.41] + - [746, 1762.41] - - [2368, 448, 1, 1280] - - [765, 5972.58] + - [839, 5972.58] - - [128, 3584, 1, 256] - - [741, 5224.32] + - [815, 5224.32] - - [704, 448, 1, 1280] - - [741, 4566.86] + - [815, 4566.86] - - [448, 5056, 1, 128] - - [723, 3876.19] + - [797, 3876.19] - - [6144, 4, 1, 2560] - - [665, 948.751] + - [739, 948.751] - - [5056, 3584, 1, 256] - - [755, 8162.56] + - [829, 8162.56] - - [4288, 4288, 1, 256] - - [762, 7653.34] + - [836, 7653.34] - - [1408, 5056, 1, 128] - - [729, 4554.34] + - [803, 4554.34] - - [2944, 3584, 1, 128] - - [735, 4147.0] + - [809, 4147.0] - - [3584, 2368, 1, 256] - - [756, 8195.05] + - [830, 8195.05] - - [5888, 5056, 1, 1280] - - [755, 9413.43] + - [829, 9413.43] - - [128, 1024, 1, 1280] - - [708, 4433.83] + - [782, 4433.83] - - [8448, 24000, 1, 2816] - - [749, 5227.12] + - [823, 5227.12] - - [64, 704, 1, 256] - - [672, 1441.89] + - [746, 1441.89] - - [4288, 256, 1, 1280] - - [771, 5687.8] + - [845, 5687.8] - - [3584, 3584, 1, 3328] - - [746, 9183.63] + - [820, 9183.63] - - [704, 64, 1, 128] - - [650, 402.835] + - [724, 402.835] - - [3072, 1500, 1, 128] - - [745, 7395.08] + - [819, 7395.08] - - [2048, 3136, 1, 512] - - [787, 8447.3] + - [861, 8447.3] - - [3025, 256, 64, 64] - - [795, 8063.79] + - [869, 8063.79] - - [5888, 6784, 1, 256] - - [739, 9282.01] + - [813, 9282.01] - - [4288, 2944, 1, 3328] - - [739, 9153.87] + - [813, 9153.87] - - [2944, 64, 1, 128] - - [656, 1463.53] + - [730, 1463.53] - - [1024, 128, 1, 3328] - - [706, 5377.41] + - [780, 5377.41] - - [1024, 16, 1, 500000] - - [622, 3997.13] + - [696, 3997.13] - - [4288, 128, 1, 3328] - - [710, 6053.31] + - [784, 6053.31] - - [7680, 128, 1, 2560] - - [756, 7769.24] + - [830, 7769.24] - - [256, 5056, 1, 1280] - - [765, 7200.84] + - [839, 7200.84] - - [1408, 256, 1, 128] - - [733, 1671.74] + - [807, 1671.74] - - [2944, 5888, 1, 3328] - - [745, 8642.18] + - [819, 8642.18] - - [6784, 5888, 1, 1280] - - [759, 8871.15] + - [833, 8871.15] - - [3072, 1, 1, 1024] - - [705, 205.972] + - [779, 205.972] - - [704, 128, 1, 256] - - [668, 1935.39] + - [742, 1935.39] - - [5888, 4288, 1, 1280] - - [746, 9176.7] + - [820, 9176.7] - - [1024, 24000, 1, 2048] - - [745, 8667.79] + - [819, 8667.79] - - [448, 256, 1, 1280] - - [678, 4327.95] + - [752, 4327.95] - - [5888, 3584, 1, 128] - - [723, 4669.45] + - [797, 4669.45] - - [64, 4288, 1, 3328] - - [703, 5375.04] + - [777, 5375.04] - - [448, 4, 1, 1280] - - [694, 289.716] + - [768, 289.716] - - [6784, 6784, 1, 3328] - - [752, 8306.73] + - [826, 8306.73] - - [5056, 4, 1, 1280] - - [664, 607.199] + - [738, 607.199] - - [4, 5888, 1, 3328] - - [689, 651.538] + - [763, 651.538] - - [256, 1408, 1, 1280] - - [741, 5177.09] + - [815, 5177.09] - - [3072, 16, 1, 1024] - - [700, 2207.63] + - [774, 2207.63] - - [704, 3584, 1, 128] - - [733, 3653.51] + - [807, 3653.51] - - [1024, 2, 1, 512] - - [720, 156.138] + - [794, 156.138] - - [5888, 448, 1, 3328] - - [741, 7896.85] + - [815, 7896.85] - - [2368, 4288, 1, 1280] - - [738, 8517.63] + - [812, 8517.63] - - [4288, 2944, 1, 128] - - [727, 4439.26] + - [801, 4439.26] - - [256, 64, 1, 3328] - - [713, 2704.76] + - [787, 2704.76] - - [2944, 64, 1, 3328] - - [678, 5647.15] + - [752, 5647.15] - - [6784, 64, 1, 3328] - - [751, 6434.61] + - [825, 6434.61] - - [5056, 2944, 1, 3328] - - [762, 8497.2] + - [836, 8497.2] - - [448, 128, 1, 256] - - [680, 1516.64] + - [754, 1516.64] - - [2944, 3584, 1, 256] - - [756, 8365.83] + - [830, 8365.83] - - [1408, 1408, 1, 3328] - - [739, 8440.42] + - [813, 8440.42] - - [1856, 128, 1, 1280] - - [741, 5242.93] + - [815, 5242.93] - - [3584, 3584, 1, 128] - - [723, 4385.94] + - [797, 4385.94] - - [64, 3584, 1, 256] - - [741, 3276.9] + - [815, 3276.9] - - [1408, 4, 1, 3328] - - [664, 605.504] + - [738, 605.504] - - [128, 2944, 1, 3328] - - [709, 6295.75] + - [783, 6295.75] - - [3584, 704, 1, 256] - - [746, 7711.64] + - [820, 7711.64] - - [2944, 448, 1, 3328] - - [757, 6503.97] + - [831, 6503.97] - - [1024, 2, 1, 500000] - - [626, 521.803] + - [700, 521.803] - - [3584, 1408, 1, 3328] - - [748, 8296.2] + - [822, 8296.2] - - [704, 3584, 1, 1280] - - [753, 7670.65] + - [827, 7670.65] - - [1024, 1408, 1, 128] - - [728, 2830.61] + - [802, 2830.61] - - [1856, 6784, 1, 256] - - [759, 8149.67] + - [833, 8149.67] - - [4288, 448, 1, 3328] - - [740, 7406.44] + - [814, 7406.44] - - [6784, 4288, 1, 128] - - [735, 4418.09] + - [809, 4418.09] - - [6784, 704, 1, 1280] - - [756, 8302.45] + - [830, 8302.45] - - [6144, 1, 1, 2560] - - [665, 243.427] + - [739, 243.427] - - [3584, 6784, 1, 256] - - [738, 9036.59] + - [812, 9036.59] - - [6144, 16, 1, 2560] - - [672, 3266.69] + - [746, 3266.69] - - [3584, 64, 1, 128] - - [656, 1555.19] + - [730, 1555.19] - - [5888, 1024, 1, 3328] - - [746, 8888.08] + - [820, 8888.08] - - [448, 64, 1, 128] - - [642, 248.074] + - [716, 248.074] - - [704, 6784, 1, 1280] - - [742, 7892.56] + - [816, 7892.56] - - [4, 448, 1, 256] - - [664, 70.8951] + - [738, 70.8951] - - [196, 1024, 64, 256] - - [790, 6630.86] + - [864, 6630.86] - - [5888, 128, 1, 256] - - [740, 5715.09] + - [814, 5715.09] - - [4096, 16, 1, 4096] - - [686, 3251.5] + - [760, 3251.5] - - [1856, 5056, 1, 3328] - - [755, 8740.27] + - [829, 8740.27] - - [4, 6784, 1, 256] - - [778, 360.412] + - [852, 360.412] - - [1024, 3584, 1, 128] - - [723, 3456.27] + - [797, 3456.27] - - [64, 704, 1, 3328] - - [691, 3817.47] + - [765, 3817.47] - - [2368, 2944, 1, 128] - - [729, 4605.47] + - [803, 4605.47] - - [5056, 64, 1, 256] - - [741, 3863.79] + - [815, 3863.79] - - [512, 1500, 1, 1536] - - [741, 6801.56] + - [815, 6801.56] - - [512, 1, 1, 500000] - - [630, 261.068] + - [704, 261.068] - - [5888, 2944, 1, 3328] - - [745, 8501.88] + - [819, 8501.88] - - [128, 3584, 1, 1280] - - [746, 5938.64] + - [820, 5938.64] - - [1024, 704, 1, 128] - - [732, 2172.29] + - [806, 2172.29] - - [1408, 2368, 1, 128] - - [728, 4023.2] + - [802, 4023.2] - - [5888, 2368, 1, 128] - - [729, 4424.62] + - [803, 4424.62] - - [128, 5056, 1, 3328] - - [741, 6692.16] + - [815, 6692.16] - - [3584, 6784, 1, 1280] - - [739, 9488.64] + - [813, 9488.64] - - [4288, 1856, 1, 256] - - [749, 8287.52] + - [823, 8287.52] - - [1856, 5888, 1, 256] - - [760, 7707.83] + - [834, 7707.83] - - [256, 256, 1, 256] - - [707, 1613.29] + - [781, 1613.29] - - [4288, 4288, 1, 3328] - - [749, 8923.59] + - [823, 8923.59] - - [1024, 1024, 1, 128] - - [729, 2553.71] + - [803, 2553.71] - - [4288, 1408, 1, 1280] - - [749, 8930.47] + - [823, 8930.47] - - [3584, 5056, 1, 128] - - [733, 4495.15] + - [807, 4495.15] - - [4, 1024, 1, 3328] - - [689, 415.694] + - [763, 415.694] - - [4, 704, 1, 128] - - [777, 13.9634] + - [851, 13.9634] - - [4288, 2368, 1, 256] - - [774, 7135.08] + - [848, 7135.08] - - [2944, 5056, 1, 1280] - - [746, 9118.61] + - [820, 9118.61] - - [448, 6784, 1, 256] - - [770, 5430.31] + - [844, 5430.31] - - [64, 128, 1, 128] - - [653, 83.057] + - [727, 83.057] - - [1856, 2368, 1, 128] - - [729, 4422.75] + - [803, 4422.75] - - [6784, 2368, 1, 3328] - - [742, 8769.4] + - [816, 8769.4] - - [1408, 6784, 1, 128] - - [729, 4739.0] + - [803, 4739.0] - - [256, 1024, 1, 1280] - - [751, 5722.21] + - [825, 5722.21] - - [704, 4, 1, 128] - - [777, 8.66578] + - [851, 8.66578] - - [1408, 4, 1, 128] - - [777, 26.1439] + - [851, 26.1439] - - [4288, 128, 1, 256] - - [751, 4865.38] + - [825, 4865.38] - - [4288, 1856, 1, 3328] - - [738, 9250.04] + - [812, 9250.04] - - [3584, 448, 1, 128] - - [729, 3029.59] + - [803, 3029.59] - - [64, 4288, 1, 128] - - [646, 1535.38] + - [720, 1535.38] - - [64, 448, 1, 3328] - - [716, 3457.36] + - [790, 3457.36] - - [448, 4, 1, 3328] - - [694, 367.328] + - [768, 367.328] - - [256, 4, 1, 3328] - - [785, 320.389] + - [859, 320.389] - - [4, 1408, 1, 1280] - - [782, 344.039] + - [856, 344.039] - - [3584, 64, 1, 1280] - - [670, 5191.07] + - [744, 5191.07] - - [1408, 448, 1, 128] - - [730, 2218.24] + - [804, 2218.24] - - [3584, 1024, 1, 1280] - - [752, 8253.11] + - [826, 8253.11] - - [1856, 5056, 1, 256] - - [770, 7552.55] + - [844, 7552.55] - - [4, 3584, 1, 256] - - [689, 325.456] + - [763, 325.456] - - [6784, 4288, 1, 3328] - - [745, 8655.34] + - [819, 8655.34] - - [4, 2944, 1, 1280] - - [689, 547.821] + - [763, 547.821] - - [1024, 4288, 1, 256] - - [746, 7788.83] + - [820, 7788.83] - - [5888, 3584, 1, 3328] - - [749, 9173.39] + - [823, 9173.39] - - [1856, 4, 1, 256] - - [780, 282.919] + - [854, 282.919] - - [4, 256, 1, 256] - - [689, 49.7485] + - [763, 49.7485] - - [5056, 3584, 1, 3328] - - [755, 8457.53] + - [829, 8457.53] - - [1408, 128, 1, 3328] - - [709, 5714.52] + - [783, 5714.52] - - [4, 64, 1, 1280] - - [785, 42.7667] + - [859, 42.7667] - - [2368, 1408, 1, 1280] - - [746, 8224.92] + - [820, 8224.92] - - [5056, 2944, 1, 1280] - - [738, 9295.13] + - [812, 9295.13] - - [8448, 6000, 1, 2816] - - [742, 8037.97] + - [816, 8037.97] - - [4, 4, 1, 128] - - [777, 0.1433898] + - [851, 0.1433898] - - [3584, 256, 1, 256] - - [741, 6116.79] + - [815, 6116.79] - - [3584, 2944, 1, 1280] - - [738, 8796.49] + - [812, 8796.49] - - [1024, 6784, 1, 256] - - [745, 8187.86] + - [819, 8187.86] - - [4, 128, 1, 256] - - [689, 30.4407] + - [763, 30.4407] - - [6784, 448, 1, 256] - - [741, 7862.3] + - [815, 7862.3] - - [5124, 9124, 1, 2048] - - [743, 8176.41] + - [817, 8176.41] - - [2944, 5056, 1, 3328] - - [738, 9328.34] + - [812, 9328.34] - - [6784, 4, 1, 128] - - [776, 204.9] + - [850, 204.9] - - [2944, 1408, 1, 128] - - [727, 3838.2] + - [801, 3838.2] - - [448, 128, 1, 3328] - - [692, 4632.16] + - [766, 4632.16] - - [64, 2944, 1, 3328] - - [709, 5663.47] + - [783, 5663.47] - - [5056, 6784, 1, 3328] - - [745, 8420.17] + - [819, 8420.17] - - [704, 2368, 1, 128] - - [729, 3321.79] + - [803, 3321.79] - - [3072, 1500, 1, 1024] - - [746, 8221.77] + - [820, 8221.77] - - [128, 2944, 1, 256] - - [741, 4550.52] + - [815, 4550.52] - - [128, 6784, 1, 128] - - [646, 2767.76] + - [720, 2767.76] - - [3584, 4288, 1, 256] - - [745, 8808.64] + - [819, 8808.64] - - [448, 1856, 1, 256] - - [750, 5166.63] + - [824, 5166.63] - - [1856, 6784, 1, 3328] - - [742, 8339.76] + - [816, 8339.76] - - [3584, 128, 1, 3328] - - [751, 6791.57] + - [825, 6791.57] - - [64, 1856, 1, 256] - - [673, 2210.03] + - [747, 2210.03] - - [64, 448, 1, 256] - - [705, 1008.35] + - [779, 1008.35] - - [5888, 4288, 1, 256] - - [745, 8869.63] + - [819, 8869.63] - - [128, 1500, 1, 1280] - - [702, 4733.54] + - [776, 4733.54] - - [5056, 1408, 1, 256] - - [743, 7523.31] + - [817, 7523.31] - - [35, 8457, 1, 4096] - - [636, 4023.17] + - [710, 4023.17] - - [64, 256, 1, 1280] - - [697, 1941.91] + - [771, 1941.91] - - [2944, 4, 1, 128] - - [776, 95.7426] + - [850, 95.7426] - - [3584, 1024, 1, 256] - - [768, 6553.68] + - [842, 6553.68] - - [512, 6000, 1, 1536] - - [742, 7357.25] + - [816, 7357.25] - - [256, 704, 1, 256] - - [741, 2912.81] + - [815, 2912.81] - - [5888, 5888, 1, 256] - - [752, 8802.7] + - [826, 8802.7] - - [4288, 1024, 1, 1280] - - [745, 8248.83] + - [819, 8248.83] - - [5888, 128, 1, 3328] - - [695, 6848.59] + - [769, 6848.59] - - [448, 6784, 1, 3328] - - [741, 8343.78] + - [815, 8343.78] - - [2944, 1408, 1, 1280] - - [738, 9229.48] + - [812, 9229.48] - - [3072, 6000, 1, 1024] - - [759, 9015.01] + - [833, 9015.01] - - [1024, 32, 1, 512] - - [680, 1498.07] + - [754, 1498.07] - - [2944, 1856, 1, 3328] - - [755, 7176.48] + - [829, 7176.48] - - [2368, 64, 1, 128] - - [646, 1206.48] + - [720, 1206.48] - - [256, 1024, 1, 128] - - [723, 1178.28] + - [797, 1178.28] - - [3584, 5888, 1, 1280] - - [745, 9023.58] + - [819, 9023.58] - - [64, 4, 1, 128] - - [777, 1.089372] + - [851, 1.089372] - - [6784, 1856, 1, 1280] - - [739, 8964.51] + - [813, 8964.51] - - [2944, 5056, 1, 256] - - [745, 8860.12] + - [819, 8860.12] - - [5888, 256, 1, 3328] - - [756, 8308.66] + - [830, 8308.66] - - [2944, 4288, 1, 128] - - [724, 4507.61] + - [798, 4507.61] - - [3584, 1408, 1, 256] - - [739, 8234.71] + - [813, 8234.71] - - [704, 3584, 1, 3328] - - [751, 7377.26] + - [825, 7377.26] - - [5056, 448, 1, 1280] - - [740, 7145.47] + - [814, 7145.47] - - [3584, 1856, 1, 3328] - - [756, 8954.81] + - [830, 8954.81] - - [64, 1408, 1, 128] - - [653, 731.974] + - [727, 731.974] - - [4288, 6784, 1, 1280] - - [745, 9166.55] + - [819, 9166.55] - - [1024, 3000, 1, 2048] - - [756, 7723.83] + - [830, 7723.83] - - [1408, 704, 1, 1280] - - [746, 7863.1] + - [820, 7863.1] - - [2944, 1024, 1, 256] - - [739, 5035.02] + - [813, 5035.02] - - [256, 64, 1, 128] - - [645, 150.757] + - [719, 150.757] - - [2368, 4288, 1, 3328] - - [743, 8568.84] + - [817, 8568.84] - - [4, 1408, 1, 256] - - [689, 219.885] + - [763, 219.885] - - [1024, 1408, 1, 1280] - - [771, 6761.13] + - [845, 6761.13] - - [64, 64, 1, 256] - - [671, 198.694] + - [745, 198.694] - - [704, 256, 1, 3328] - - [741, 4291.62] + - [815, 4291.62] - - [6784, 5056, 1, 256] - - [740, 8545.02] + - [814, 8545.02] - - [1856, 1856, 1, 128] - - [728, 4034.93] + - [802, 4034.93] - - [4288, 5888, 1, 256] - - [759, 8998.05] + - [833, 8998.05] - - [4, 704, 1, 3328] - - [694, 452.4] + - [768, 452.4] - - [35, 8457, 1, 2048] - - [637, 3375.37] + - [711, 3375.37] - - [448, 2944, 1, 256] - - [741, 6346.74] + - [815, 6346.74] - - [4, 4288, 1, 3328] - - [694, 630.978] + - [768, 630.978] - - [2944, 6784, 1, 256] - - [768, 8002.92] + - [842, 8002.92] - - [2944, 2944, 1, 128] - - [723, 4661.41] + - [797, 4661.41] - - [4, 4, 1, 1280] - - [694, 3.14762] + - [768, 3.14762] - - [1856, 3584, 1, 1280] - - [738, 8677.66] + - [812, 8677.66] - - [64, 2944, 1, 256] - - [741, 2926.95] + - [815, 2926.95] - - [3584, 1408, 1, 1280] - - [752, 8238.9] + - [826, 8238.9] - - [448, 256, 1, 128] - - [653, 1042.72] + - [727, 1042.72] - - [4288, 448, 1, 128] - - [729, 3698.82] + - [803, 3698.82] - - [5056, 256, 1, 1280] - - [746, 7058.5] + - [820, 7058.5] - - [1856, 1408, 1, 3328] - - [743, 8348.35] + - [817, 8348.35] - - [128, 128, 1, 128] - - [653, 145.736] + - [727, 145.736] - - [1024, 4288, 1, 3328] - - [739, 8042.61] + - [813, 8042.61] - - [448, 2368, 1, 256] - - [751, 5935.0] + - [825, 5935.0] - - [1024, 4, 1, 128] - - [777, 15.93] + - [851, 15.93] - - [64, 1408, 1, 1280] - - [675, 3865.49] + - [749, 3865.49] - - [64, 6784, 1, 1280] - - [771, 5629.61] + - [845, 5629.61] - - [5056, 448, 1, 256] - - [741, 7637.91] + - [815, 7637.91] - - [2944, 2368, 1, 3328] - - [749, 9112.44] + - [823, 9112.44] - - [704, 4288, 1, 3328] - - [741, 7950.2] + - [815, 7950.2] - - [1408, 128, 1, 256] - - [741, 2898.17] + - [815, 2898.17] - - [1024, 1856, 1, 1280] - - [739, 8087.51] + - [813, 8087.51] - - [6784, 1856, 1, 256] - - [770, 7538.25] + - [844, 7538.25] - - [512, 48000, 1, 2816] - - [738, 9704.21] + - [812, 9704.21] - - [512, 3000, 1, 2816] - - [740, 7621.63] + - [814, 7621.63] - - [128, 2368, 1, 3328] - - [703, 6038.94] + - [777, 6038.94] - - [1024, 5888, 1, 256] - - [755, 8185.82] + - [829, 8185.82] - - [64, 2944, 1, 1280] - - [702, 4540.24] + - [776, 4540.24] - - [6784, 1408, 1, 256] - - [755, 8574.0] + - [829, 8574.0] - - [5056, 64, 1, 3328] - - [703, 6310.97] + - [777, 6310.97] - - [128, 704, 1, 128] - - [642, 696.618] + - [716, 696.618] - - [1408, 2368, 1, 256] - - [741, 4995.06] + - [815, 4995.06] - - [1408, 1408, 1, 256] - - [738, 7552.34] + - [812, 7552.34] - - [4, 64, 1, 128] - - [776, 1.90441] + - [850, 1.90441] - - [64, 128, 1, 1280] - - [715, 1272.64] + - [789, 1272.64] - - [1024, 8, 1, 500000] - - [623, 2013.23] + - [697, 2013.23] - - [4, 2368, 1, 128] - - [777, 49.9526] + - [851, 49.9526] - - [2368, 2368, 1, 128] - - [728, 4483.8] + - [802, 4483.8] - - [64, 5888, 1, 128] - - [645, 1957.67] + - [719, 1957.67] - - [5888, 4, 1, 3328] - - [778, 638.798] + - [852, 638.798] - - [6784, 1408, 1, 128] - - [723, 4715.61] + - [797, 4715.61] - - [1408, 5056, 1, 256] - - [755, 8557.67] + - [829, 8557.67] - - [512, 50176, 1, 128] - - [786, 8809.39] + - [860, 8809.39] - - [5056, 128, 1, 3328] - - [678, 6810.66] + - [752, 6810.66] - - [128, 128, 1, 1280] - - [712, 1899.69] + - [786, 1899.69] - - [512, 2, 1, 512] - - [632, 87.4813] + - [706, 87.4813] - - [448, 704, 1, 256] - - [751, 3765.97] + - [825, 3765.97] - - [4288, 3584, 1, 128] - - [736, 4563.77] + - [810, 4563.77] - - [2944, 128, 1, 3328] - - [678, 6507.45] + - [752, 6507.45] - - [128, 5056, 1, 1280] - - [741, 6557.85] + - [815, 6557.85] - - [3584, 5056, 1, 1280] - - [738, 9407.93] + - [812, 9407.93] - - [256, 448, 1, 1280] - - [702, 4096.1] + - [776, 4096.1] - - [704, 704, 1, 128] - - [728, 2374.31] + - [802, 2374.31] - - [5056, 4, 1, 128] - - [776, 125.52] + - [850, 125.52] - - [704, 256, 1, 1280] - - [751, 4016.23] + - [825, 4016.23] - - [64, 2368, 1, 3328] - - [708, 5159.29] + - [782, 5159.29] - - [1856, 1024, 1, 128] - - [728, 3356.47] + - [802, 3356.47] - - [1856, 64, 1, 128] - - [645, 945.644] + - [719, 945.644] - - [4096, 64, 1, 4096] - - [711, 6260.24] + - [785, 6260.24] - - [1024, 24000, 1, 1536] - - [755, 9368.5] + - [829, 9368.5] - - [704, 4288, 1, 256] - - [752, 7329.39] + - [826, 7329.39] - - [5888, 2368, 1, 1280] - - [741, 8624.71] + - [815, 8624.71] - - [6784, 1856, 1, 3328] - - [745, 9012.45] + - [819, 9012.45] - - [64, 128, 1, 256] - - [671, 374.591] + - [745, 374.591] - - [2368, 5888, 1, 1280] - - [739, 9045.76] + - [813, 9045.76] - - [5888, 256, 1, 1280] - - [756, 7999.17] + - [830, 7999.17] - - [4, 5888, 1, 1280] - - [689, 615.839] + - [763, 615.839] - - [704, 128, 1, 128] - - [645, 693.269] + - [719, 693.269] - - [1024, 4, 1, 1280] - - [784, 372.464] + - [858, 372.464] - - [2368, 1856, 1, 3328] - - [756, 8246.91] + - [830, 8246.91] - - [2368, 128, 1, 128] - - [646, 1963.53] + - [720, 1963.53] - - [2944, 704, 1, 256] - - [756, 7116.24] + - [830, 7116.24] - - [5056, 128, 1, 128] - - [649, 2519.49] + - [723, 2519.49] - - [2368, 1024, 1, 3328] - - [741, 7959.13] + - [815, 7959.13] - - [35, 700, 1, 2048] - - [637, 1766.86] + - [711, 1766.86] - - [256, 704, 1, 3328] - - [741, 4296.56] + - [815, 4296.56] - - [704, 3584, 1, 256] - - [740, 7441.61] + - [814, 7441.61] - - [704, 2944, 1, 3328] - - [757, 7195.81] + - [831, 7195.81] - - [6784, 1024, 1, 128] - - [728, 4509.18] + - [802, 4509.18] - - [256, 448, 1, 128] - - [653, 838.003] + - [727, 838.003] - - [448, 1024, 1, 3328] - - [751, 6515.65] + - [825, 6515.65] - - [2944, 1024, 1, 3328] - - [746, 8751.63] + - [820, 8751.63] - - [2944, 5056, 1, 128] - - [723, 4799.73] + - [797, 4799.73] - - [2368, 256, 1, 256] - - [740, 4754.67] + - [814, 4754.67] - - [1408, 6784, 1, 256] - - [768, 7477.09] + - [842, 7477.09] - - [6784, 1408, 1, 3328] - - [746, 8968.57] + - [820, 8968.57] - - [4288, 6784, 1, 128] - - [721, 4455.74] + - [795, 4455.74] - - [1408, 2944, 1, 128] - - [733, 3862.79] + - [807, 3862.79] - - [704, 64, 1, 256] - - [672, 1441.89] + - [746, 1441.89] - - [3072, 4, 1, 1024] - - [690, 711.803] + - [764, 711.803] - - [256, 2368, 1, 3328] - - [765, 5199.73] + - [839, 5199.73] - - [6784, 2944, 1, 1280] - - [749, 8914.45] + - [823, 8914.45] - - [4288, 1856, 1, 128] - - [729, 4683.3] + - [803, 4683.3] - - [1856, 2944, 1, 128] - - [723, 4589.34] + - [797, 4589.34] - - [6784, 448, 1, 128] - - [723, 3918.53] + - [797, 3918.53] - - [64, 3584, 1, 128] - - [654, 1468.11] + - [728, 1468.11] - - [448, 5056, 1, 1280] - - [746, 7561.4] + - [820, 7561.4] - - [4288, 5056, 1, 1280] - - [738, 9304.11] + - [812, 9304.11] - - [2368, 1856, 1, 128] - - [728, 4322.17] + - [802, 4322.17] - - [128, 448, 1, 1280] - - [708, 3336.48] + - [782, 3336.48] - - [4288, 704, 1, 256] - - [751, 7834.65] + - [825, 7834.65] - - [256, 3584, 1, 128] - - [724, 2500.96] + - [798, 2500.96] - - [5888, 704, 1, 256] - - [770, 7244.49] + - [844, 7244.49] - - [3584, 1024, 1, 128] - - [735, 3169.03] + - [809, 3169.03] - - [256, 5888, 1, 3328] - - [756, 7763.47] + - [830, 7763.47] - - [1408, 4288, 1, 3328] - - [738, 9273.8] + - [812, 9273.8] - - [6784, 4288, 1, 256] - - [746, 8825.2] + - [820, 8825.2] - - [4288, 256, 1, 128] - - [725, 2621.54] + - [799, 2621.54] - - [448, 1856, 1, 3328] - - [766, 5859.8] + - [840, 5859.8] - - [5888, 256, 1, 256] - - [756, 7124.84] + - [830, 7124.84] - - [1024, 4, 1, 500000] - - [621, 1030.2] + - [695, 1030.2] - - [6784, 1024, 1, 1280] - - [738, 9083.11] + - [812, 9083.11] - - [5888, 1024, 1, 128] - - [725, 4297.16] + - [799, 4297.16] - - [1024, 128, 1, 256] - - [741, 2086.82] + - [815, 2086.82] - - [512, 16, 1, 500000] - - [622, 3921.96] + - [696, 3921.96] - - [128, 64, 1, 3328] - - [712, 1969.97] + - [786, 1969.97] - - [448, 64, 1, 256] - - [697, 1092.37] + - [771, 1092.37] - - [2368, 256, 1, 128] - - [728, 2174.84] + - [802, 2174.84] - - [6784, 3584, 1, 1280] - - [738, 9558.82] + - [812, 9558.82] - - [1024, 6784, 1, 1280] - - [747, 8637.72] + - [821, 8637.72] - - [2944, 64, 1, 1280] - - [669, 4770.13] + - [743, 4770.13] - - [1408, 2944, 1, 1280] - - [738, 9238.47] + - [812, 9238.47] - - [256, 1856, 1, 256] - - [764, 4498.43] + - [838, 4498.43] - - [1408, 2368, 1, 3328] - - [746, 8344.97] + - [820, 8344.97] - - [2944, 4, 1, 3328] - - [781, 661.209] + - [855, 661.209] - - [128, 1408, 1, 3328] - - [709, 5641.42] + - [783, 5641.42] - - [2944, 1856, 1, 128] - - [723, 4488.04] + - [797, 4488.04] - - [256, 2944, 1, 128] - - [733, 2233.18] + - [807, 2233.18] - - [256, 6784, 1, 128] - - [722, 3139.9] + - [796, 3139.9] - - [2368, 4, 1, 128] - - [777, 38.7612] + - [851, 38.7612] - - [1408, 256, 1, 3328] - - [773, 4927.67] + - [847, 4927.67] - - [1856, 4, 1, 128] - - [777, 42.3719] + - [851, 42.3719] - - [1024, 16, 1, 512] - - [689, 1115.61] + - [763, 1115.61] - - [5056, 6784, 1, 128] - - [724, 4963.45] + - [798, 4963.45] - - [4288, 5056, 1, 128] - - [722, 4928.09] + - [796, 4928.09] - - [1856, 5888, 1, 128] - - [729, 4865.15] + - [803, 4865.15] - - [7680, 2, 1, 2560] - - [665, 499.612] + - [739, 499.612] - - [3584, 1856, 1, 256] - - [755, 7978.38] + - [829, 7978.38] - - [4288, 3584, 1, 1280] - - [755, 7852.26] + - [829, 7852.26] - - [2368, 448, 1, 256] - - [770, 5238.93] + - [844, 5238.93] - - [4288, 256, 1, 3328] - - [741, 6751.34] + - [815, 6751.34] - - [1856, 704, 1, 128] - - [723, 3525.56] + - [797, 3525.56] - - [1408, 64, 1, 256] - - [682, 1884.8] + - [756, 1884.8] - - [64, 1856, 1, 128] - - [659, 888.205] + - [733, 888.205] - - [4, 256, 1, 128] - - [776, 7.38178] + - [850, 7.38178] - - [512, 16, 1, 512] - - [689, 663.756] + - [763, 663.756] - - [704, 5888, 1, 128] - - [723, 4424.55] + - [797, 4424.55] - - [6784, 3584, 1, 128] - - [725, 3823.4] + - [799, 3823.4] - - [1024, 64, 1, 256] - - [667, 1379.81] + - [741, 1379.81] - - [64, 2368, 1, 256] - - [741, 2424.93] + - [815, 2424.93] - - [5124, 1500, 1, 2048] - - [759, 8391.84] + - [833, 8391.84] - - [4288, 5056, 1, 3328] - - [745, 9274.14] + - [819, 9274.14] - - [4, 1856, 1, 1280] - - [689, 453.474] + - [763, 453.474] - - [4288, 128, 1, 128] - - [723, 2157.8] + - [797, 2157.8] - - [512, 2, 1, 500000] - - [633, 516.895] + - [707, 516.895] - - [1408, 1408, 1, 128] - - [724, 3600.49] + - [798, 3600.49] - - [7680, 16, 1, 2560] - - [704, 3542.59] + - [778, 3542.59] - - [1856, 128, 1, 128] - - [656, 1532.8] + - [730, 1532.8] - - [5056, 2368, 1, 256] - - [768, 7684.07] + - [842, 7684.07] - - [4288, 704, 1, 3328] - - [741, 7642.96] + - [815, 7642.96] - - [448, 3584, 1, 256] - - [751, 6734.07] + - [825, 6734.07] - - [2368, 64, 1, 1280] - - [702, 3962.24] + - [776, 3962.24] - - [2368, 1024, 1, 1280] - - [753, 7989.64] + - [827, 7989.64] - - [2944, 1408, 1, 3328] - - [756, 8954.66] + - [830, 8954.66] - - [6144, 1500, 1, 2560] - - [774, 8170.07] + - [848, 8170.07] - - [4224, 1, 1, 128] - - [705, 76.9] + - [779, 76.9] - - [1024, 1408, 1, 3328] - - [771, 6961.38] + - [845, 6961.38] - - [2944, 5888, 1, 1280] - - [752, 8797.53] + - [826, 8797.53] - - [8448, 2, 1, 2816] - - [627, 496.958] + - [701, 496.958] - - [1408, 4, 1, 1280] - - [782, 471.891] + - [856, 471.891] - - [5888, 3584, 1, 256] - - [759, 8246.3] + - [833, 8246.3] - - [2368, 5056, 1, 128] - - [722, 4906.9] + - [796, 4906.9] - - [1408, 1856, 1, 3328] - - [746, 9006.8] + - [820, 9006.8] - - [4, 4, 1, 3328] - - [694, 5.83793] + - [768, 5.83793] - - [5888, 5056, 1, 3328] - - [759, 8545.1] + - [833, 8545.1] - - [7680, 6000, 1, 2560] - - [752, 7996.0] + - [826, 7996.0] - - [6784, 1408, 1, 1280] - - [746, 8888.13] + - [820, 8888.13] - - [4, 1024, 1, 1280] - - [694, 302.109] + - [768, 302.109] - - [512, 3000, 1, 2560] - - [746, 7809.43] + - [820, 7809.43] - - [704, 2944, 1, 256] - - [751, 4909.24] + - [825, 4909.24] - - [4288, 64, 1, 256] - - [751, 3264.72] + - [825, 3264.72] - - [6784, 5888, 1, 3328] - - [759, 9544.52] + - [833, 9544.52] - - [2368, 4288, 1, 128] - - [722, 4873.03] + - [796, 4873.03] - - [64, 4288, 1, 1280] - - [708, 4656.42] + - [782, 4656.42] - - [6784, 64, 1, 1280] - - [741, 6230.43] + - [815, 6230.43] - - [3584, 128, 1, 128] - - [649, 2315.57] + - [723, 2315.57] - - [1024, 6784, 1, 128] - - [723, 3758.94] + - [797, 3758.94] - - [1024, 1500, 1, 1536] - - [772, 6972.0] + - [846, 6972.0] - - [1408, 64, 1, 3328] - - [675, 5079.58] + - [749, 5079.58] - - [6784, 4, 1, 256] - - [661, 487.938] + - [735, 487.938] - - [1408, 1408, 1, 1280] - - [774, 7423.31] + - [848, 7423.31] - - [256, 2368, 1, 256] - - [741, 4986.9] + - [815, 4986.9] - - [3072, 3000, 1, 1024] - - [743, 7844.01] + - [817, 7844.01] - - [448, 4288, 1, 3328] - - [742, 7204.79] + - [816, 7204.79] - - [2368, 1408, 1, 256] - - [774, 5897.96] + - [848, 5897.96] - - [704, 2368, 1, 256] - - [741, 7000.93] + - [815, 7000.93] - - [1024, 24000, 1, 2560] - - [768, 8562.31] + - [842, 8562.31] - - [2944, 448, 1, 1280] - - [756, 7155.93] + - [830, 7155.93] - - [5888, 2368, 1, 3328] - - [755, 9252.42] + - [829, 9252.42] - - [1024, 256, 1, 128] - - [737, 1255.88] + - [811, 1255.88] - - [5124, 9124, 1, 1760] - - [749, 9168.49] + - [823, 9168.49] - - [448, 1408, 1, 1280] - - [741, 6150.34] + - [815, 6150.34] - - [448, 1856, 1, 1280] - - [756, 6489.76] + - [830, 6489.76] - - [4288, 448, 1, 1280] - - [771, 6887.02] + - [845, 6887.02] - - [5888, 704, 1, 3328] - - [751, 8230.64] + - [825, 8230.64] - - [4, 1856, 1, 128] - - [777, 27.0964] + - [851, 27.0964] - - [5056, 256, 1, 128] - - [722, 3469.01] + - [796, 3469.01] - - [1856, 256, 1, 128] - - [723, 2534.16] + - [797, 2534.16] - - [128, 2368, 1, 256] - - [741, 3660.22] + - [815, 3660.22] - - [704, 4, 1, 256] - - [689, 134.596] + - [763, 134.596] - - [1024, 6784, 1, 3328] - - [743, 8482.75] + - [817, 8482.75] - - [1408, 5888, 1, 128] - - [723, 4644.52] + - [797, 4644.52] - - [4288, 4, 1, 128] - - [776, 35.8799] + - [850, 35.8799] - - [512, 3136, 1, 2048] - - [788, 6386.69] + - [862, 6386.69] - - [1408, 1024, 1, 256] - - [741, 5440.82] + - [815, 5440.82] - - [128, 64, 1, 256] - - [671, 380.019] + - [745, 380.019] - - [8448, 1500, 1, 2816] - - [738, 9155.92] + - [812, 9155.92] - - [256, 704, 1, 128] - - [723, 895.623] + - [797, 895.623] - - [2560, 7000, 1, 2560] - - [750, 8565.66] + - [824, 8565.66] - - [5888, 64, 1, 1280] - - [765, 5007.83] + - [839, 5007.83] - - [128, 4, 1, 3328] - - [784, 165.21] + - [858, 165.21] - - [5056, 6784, 1, 1280] - - [749, 9331.48] + - [823, 9331.48] - - [1024, 448, 1, 1280] - - [751, 6501.46] + - [825, 6501.46] - - [704, 5056, 1, 3328] - - [738, 8090.13] + - [812, 8090.13] - - [128, 5056, 1, 256] - - [751, 5537.37] + - [825, 5537.37] - - [3584, 5056, 1, 3328] - - [747, 8633.24] + - [821, 8633.24] - - [1856, 4, 1, 3328] - - [785, 582.814] + - [859, 582.814] - - [4, 2944, 1, 128] - - [776, 114.292] + - [850, 114.292] - - [2368, 2944, 1, 3328] - - [755, 8749.55] + - [829, 8749.55] - - [448, 448, 1, 1280] - - [679, 4694.93] + - [753, 4694.93] - - [128, 4, 1, 128] - - [776, 4.94734] + - [850, 4.94734] - - [2368, 3584, 1, 256] - - [755, 8418.59] + - [829, 8418.59] - - [4608, 3000, 1, 1536] - - [745, 9076.47] + - [819, 9076.47] - - [1024, 256, 1, 1280] - - [751, 5562.84] + - [825, 5562.84] - - [5056, 3584, 1, 1280] - - [745, 8365.09] + - [819, 8365.09] - - [5124, 9124, 1, 4096] - - [755, 8648.58] + - [829, 8648.58] - - [7680, 48000, 1, 2560] - - [749, 4098.26] + - [823, 4098.26] - - [1856, 704, 1, 1280] - - [741, 8141.04] + - [815, 8141.04] - - [1856, 2944, 1, 1280] - - [743, 8214.4] + - [817, 8214.4] - - [4608, 1500, 1, 1536] - - [751, 8424.53] + - [825, 8424.53] - - [1024, 48000, 1, 2816] - - [742, 8513.18] + - [816, 8513.18] - - [5124, 9124, 1, 2560] - - [759, 8641.24] + - [833, 8641.24] - - [128, 1024, 1, 256] - - [673, 2356.45] + - [747, 2356.45] - - [2944, 1408, 1, 256] - - [755, 8254.29] + - [829, 8254.29] - - [4288, 1408, 1, 3328] - - [749, 9138.49] + - [823, 9138.49] - - [3584, 64, 1, 3328] - - [662, 5629.62] + - [736, 5629.62] - - [5888, 2944, 1, 128] - - [723, 4119.33] + - [797, 4119.33] - - [2944, 1024, 1, 128] - - [725, 4002.96] + - [799, 4002.96] - - [128, 1, 1, 1024] - - [719, 20.0805] + - [793, 20.0805] - - [5124, 700, 1, 2048] - - [756, 7653.84] + - [830, 7653.84] - - [4, 4288, 1, 1280] - - [689, 587.749] + - [763, 587.749] - - [6784, 5056, 1, 128] - - [728, 4855.85] + - [802, 4855.85] - - [256, 1024, 1, 3328] - - [751, 6116.28] + - [825, 6116.28] - - [3584, 4, 1, 256] - - [663, 395.576] + - [737, 395.576] - - [1856, 64, 1, 3328] - - [678, 5732.6] + - [752, 5732.6] - - [4, 128, 1, 3328] - - [784, 162.689] + - [858, 162.689] - - [256, 12544, 1, 1024] - - [788, 7628.92] + - [862, 7628.92] - - [5888, 1408, 1, 3328] - - [749, 9524.43] + - [823, 9524.43] - - [448, 2944, 1, 128] - - [723, 3163.91] + - [797, 3163.91] - - [2368, 1856, 1, 256] - - [751, 8167.36] + - [825, 8167.36] - - [256, 5056, 1, 256] - - [741, 7292.13] + - [815, 7292.13] - - [5056, 5056, 1, 128] - - [729, 5043.99] + - [803, 5043.99] - - [448, 3584, 1, 3328] - - [746, 6839.56] + - [820, 6839.56] - - [4, 5056, 1, 3328] - - [694, 639.886] + - [768, 639.886] - - [256, 256, 1, 128] - - [653, 554.902] + - [727, 554.902] - - [5888, 256, 1, 128] - - [725, 3562.47] + - [799, 3562.47] - - [4, 5056, 1, 128] - - [776, 149.907] + - [850, 149.907] - - [448, 256, 1, 256] - - [672, 2121.5] + - [746, 2121.5] - - [704, 4, 1, 3328] - - [782, 455.919] + - [856, 455.919] - - [1408, 256, 1, 256] - - [741, 4352.68] + - [815, 4352.68] - - [3584, 1856, 1, 128] - - [732, 3933.23] + - [806, 3933.23] - - [4288, 4288, 1, 128] - - [723, 4888.61] + - [797, 4888.61] - - [1856, 1024, 1, 3328] - - [759, 8242.64] + - [833, 8242.64] - - [1856, 4288, 1, 128] - - [728, 4647.4] + - [802, 4647.4] - - [1024, 6000, 1, 2560] - - [753, 8526.75] + - [827, 8526.75] - - [1024, 5056, 1, 256] - - [738, 7343.83] + - [812, 7343.83] - - [5056, 5888, 1, 128] - - [727, 4053.5] + - [801, 4053.5] - - [2368, 1408, 1, 3328] - - [741, 8466.2] + - [815, 8466.2] - - [1024, 48000, 1, 1536] - - [759, 9487.74] + - [833, 9487.74] - - [5888, 448, 1, 256] - - [772, 6081.54] + - [846, 6081.54] - - [5888, 6784, 1, 128] - - [724, 4820.27] + - [798, 4820.27] - - [2368, 4, 1, 3328] - - [783, 620.628] + - [857, 620.628] - - [6784, 5056, 1, 1280] - - [768, 8525.5] + - [842, 8525.5] - - [5056, 704, 1, 1280] - - [738, 7933.06] + - [812, 7933.06] - - [1024, 48000, 1, 2560] - - [759, 8877.94] + - [833, 8877.94] - - [4608, 32, 1, 1536] - - [688, 3556.83] + - [762, 3556.83] - - [1024, 2368, 1, 128] - - [731, 2943.75] + - [805, 2943.75] - - [128, 704, 1, 256] - - [672, 2059.8] + - [746, 2059.8] - - [2368, 448, 1, 3328] - - [751, 5290.42] + - [825, 5290.42] - - [128, 5888, 1, 3328] - - [751, 7764.43] + - [825, 7764.43] - - [448, 128, 1, 1280] - - [702, 3373.28] + - [776, 3373.28] - - [6784, 4, 1, 3328] - - [661, 676.063] + - [735, 676.063] - - [4288, 4, 1, 1280] - - [694, 564.775] + - [768, 564.775] - - [1024, 64, 1, 3328] - - [708, 4293.48] + - [782, 4293.48] - - [3072, 48000, 1, 1024] - - [758, 7826.51] + - [832, 7826.51] - - [256, 4, 1, 128] - - [777, 4.93304] + - [851, 4.93304] - - [1024, 5888, 1, 128] - - [736, 3610.46] + - [810, 3610.46] - - [3584, 5888, 1, 128] - - [724, 4722.35] + - [798, 4722.35] - - [5056, 5888, 1, 256] - - [759, 9159.11] + - [833, 9159.11] - - [2368, 1024, 1, 256] - - [751, 7482.71] + - [825, 7482.71] - - [2944, 1856, 1, 256] - - [755, 8209.0] + - [829, 8209.0] - - [1856, 6784, 1, 1280] - - [751, 8205.43] + - [825, 8205.43] - - [64, 5056, 1, 128] - - [646, 2079.35] + - [720, 2079.35] - - [64, 6784, 1, 128] - - [646, 2437.58] + - [720, 2437.58] - - [448, 704, 1, 128] - - [722, 1506.45] + - [796, 1506.45] - - [4, 1024, 1, 128] - - [777, 17.3463] + - [851, 17.3463] - - [1408, 448, 1, 256] - - [741, 5545.45] + - [815, 5545.45] - - [1408, 704, 1, 128] - - [727, 2931.65] + - [801, 2931.65] - - [64, 256, 1, 3328] - - [713, 2816.52] + - [787, 2816.52] - - [8448, 3000, 1, 2816] - - [747, 8872.99] + - [821, 8872.99] - - [6784, 448, 1, 3328] - - [741, 7555.48] + - [815, 7555.48] - - [5056, 1856, 1, 1280] - - [739, 8652.36] + - [813, 8652.36] - - [1408, 1024, 1, 3328] - - [743, 7781.42] + - [817, 7781.42] - - [2368, 256, 1, 3328] - - [747, 5392.06] + - [821, 5392.06] - - [7680, 1500, 1, 2560] - - [745, 8919.72] + - [819, 8919.72] - - [5888, 3584, 1, 1280] - - [745, 9235.85] + - [819, 9235.85] - - [1856, 3584, 1, 3328] - - [756, 8348.83] + - [830, 8348.83] - - [5888, 128, 1, 1280] - - [741, 5928.61] + - [815, 5928.61] - - [1024, 2944, 1, 256] - - [772, 6630.27] + - [846, 6630.27] - - [448, 6784, 1, 1280] - - [753, 8332.45] + - [827, 8332.45] - - [256, 3584, 1, 1280] - - [743, 7140.19] + - [817, 7140.19] - - [448, 128, 1, 128] - - [645, 552.813] + - [719, 552.813] - - [704, 5056, 1, 256] - - [751, 7959.68] + - [825, 7959.68] - - [3584, 1024, 1, 3328] - - [743, 8386.84] + - [817, 8386.84] - - [2944, 1856, 1, 1280] - - [759, 7670.29] + - [833, 7670.29] - - [128, 256, 1, 128] - - [660, 258.37] + - [734, 258.37] - - [5056, 256, 1, 256] - - [751, 5736.77] + - [825, 5736.77] - - [2944, 4288, 1, 3328] - - [738, 8730.8] + - [812, 8730.8] - - [2368, 3584, 1, 3328] - - [740, 8437.71] + - [814, 8437.71] - - [2944, 704, 1, 1280] - - [751, 8342.53] + - [825, 8342.53] - - [128, 4, 1, 256] - - [671, 24.9242] + - [745, 24.9242] - - [2944, 3584, 1, 1280] - - [753, 8322.11] + - [827, 8322.11] - - [1856, 5888, 1, 1280] - - [738, 8911.91] + - [812, 8911.91] - - [256, 256, 1, 1280] - - [702, 3653.67] + - [776, 3653.67] - - [4608, 24000, 1, 1536] - - [752, 8931.06] + - [826, 8931.06] - - [4288, 1408, 1, 256] - - [739, 8338.45] + - [813, 8338.45] - - [3584, 64, 1, 256] - - [751, 3414.07] + - [825, 3414.07] - - [64, 1856, 1, 3328] - - [678, 5460.23] + - [752, 5460.23] - - [256, 1408, 1, 128] - - [722, 1424.09] + - [796, 1424.09] - - [5888, 1408, 1, 128] - - [733, 4177.88] + - [807, 4177.88] - - [4288, 2368, 1, 1280] - - [742, 8596.05] + - [816, 8596.05] - - [4, 4288, 1, 256] - - [778, 370.954] + - [852, 370.954] - - [256, 4288, 1, 128] - - [723, 2907.99] + - [797, 2907.99] - - [256, 128, 1, 3328] - - [716, 3644.88] + - [790, 3644.88] - - [512, 8, 1, 500000] - - [628, 2025.89] + - [702, 2025.89] - - [6784, 2368, 1, 256] - - [741, 8470.41] + - [815, 8470.41] - - [5888, 128, 1, 128] - - [646, 2604.55] + - [720, 2604.55] - - [1408, 448, 1, 3328] - - [751, 6540.62] + - [825, 6540.62] - - [1024, 24000, 1, 2816] - - [768, 8364.03] + - [842, 8364.03] - - [704, 1024, 1, 1280] - - [751, 7277.28] + - [825, 7277.28] - - [1856, 256, 1, 3328] - - [741, 7039.14] + - [815, 7039.14] - - [1856, 2944, 1, 256] - - [750, 8151.59] + - [824, 8151.59] - - [5056, 1024, 1, 128] - - [724, 4422.82] + - [798, 4422.82] - - [64, 5888, 1, 1280] - - [702, 4854.62] + - [776, 4854.62] - - [7680, 3000, 1, 2560] - - [755, 8789.57] + - [829, 8789.57] - - [4224, 1500, 1, 176] - - [751, 7902.14] + - [825, 7902.14] - - [5124, 700, 1, 2560] - - [741, 8232.59] + - [815, 8232.59] - - [6784, 256, 1, 128] - - [722, 3548.92] + - [796, 3548.92] - - [5888, 704, 1, 128] - - [729, 3959.65] + - [803, 3959.65] - - [6784, 64, 1, 128] - - [657, 2150.82] + - [731, 2150.82] - - [4, 448, 1, 1280] - - [782, 268.063] + - [856, 268.063] - - [1024, 4288, 1, 1280] - - [756, 8363.72] + - [830, 8363.72] - - [2368, 5056, 1, 3328] - - [755, 8581.85] + - [829, 8581.85] - - [448, 4, 1, 128] - - [776, 16.8673] + - [850, 16.8673] - - [4, 256, 1, 3328] - - [785, 201.988] + - [859, 201.988] - - [4288, 1024, 1, 3328] - - [751, 8567.72] + - [825, 8567.72] - - [6144, 48000, 1, 2560] - - [759, 3751.68] + - [833, 3751.68] - - [1024, 5056, 1, 3328] - - [738, 9440.66] + - [812, 9440.66] - - [1024, 1856, 1, 3328] - - [759, 8244.36] + - [833, 8244.36] - - [704, 704, 1, 1280] - - [751, 5529.99] + - [825, 5529.99] - - [128, 2368, 1, 1280] - - [708, 5062.38] + - [782, 5062.38] - - [3584, 4, 1, 128] - - [777, 61.5949] + - [851, 61.5949] - - [3584, 256, 1, 1280] - - [775, 6260.24] + - [849, 6260.24] - - [4, 128, 1, 128] - - [776, 1.2587] + - [850, 1.2587] - - [128, 4288, 1, 3328] - - [687, 6186.15] + - [761, 6186.15] - - [5124, 1500, 1, 2560] - - [755, 8432.62] + - [829, 8432.62] - - [3584, 128, 1, 1280] - - [741, 6547.85] + - [815, 6547.85] - - [4, 256, 1, 1280] - - [694, 180.144] + - [768, 180.144] - - [128, 704, 1, 3328] - - [666, 5177.81] + - [740, 5177.81] - - [4288, 6784, 1, 256] - - [739, 9005.34] + - [813, 9005.34] - - [3584, 2944, 1, 3328] - - [756, 8872.27] + - [830, 8872.27] - - [128, 1856, 1, 256] - - [741, 3690.48] + - [815, 3690.48] - - [64, 4288, 1, 256] - - [741, 3007.57] + - [815, 3007.57] - - [4, 3584, 1, 3328] - - [671, 639.99] + - [745, 639.99] - - [64, 4, 1, 3328] - - [785, 98.7074] + - [859, 98.7074] - - [4, 64, 1, 3328] - - [785, 91.9069] + - [859, 91.9069] - - [35, 700, 1, 2560] - - [639, 2397.65] + - [713, 2397.65] - - [5888, 2944, 1, 256] - - [749, 9031.28] + - [823, 9031.28] - - [4, 2368, 1, 256] - - [689, 256.968] + - [763, 256.968] - - [1856, 64, 1, 256] - - [673, 2222.96] + - [747, 2222.96] - - [5056, 128, 1, 1280] - - [741, 6557.85] + - [815, 6557.85] - - [448, 4288, 1, 1280] - - [765, 6891.66] + - [839, 6891.66] - - [256, 4288, 1, 256] - - [741, 6250.51] + - [815, 6250.51] - - [1024, 4288, 1, 128] - - [725, 3951.41] + - [799, 3951.41] - - [4, 1024, 1, 256] - - [689, 182.144] + - [763, 182.144] - - [5056, 4288, 1, 256] - - [745, 8933.43] + - [819, 8933.43] - - [1024, 448, 1, 256] - - [751, 4573.33] + - [825, 4573.33] - - [1024, 3584, 1, 256] - - [746, 7447.18] + - [820, 7447.18] - - [2944, 128, 1, 1280] - - [751, 5417.27] + - [825, 5417.27] - - [49, 2048, 64, 512] - - [794, 5916.91] + - [868, 5916.91] - - [2560, 32, 1, 2560] - - [688, 4076.99] + - [762, 4076.99] - - [64, 256, 1, 256] - - [705, 689.953] + - [779, 689.953] - - [1024, 4, 1, 512] - - [697, 288.17] + - [771, 288.17] - - [128, 2368, 1, 128] - - [651, 1809.68] + - [725, 1809.68] - - [256, 704, 1, 1280] - - [741, 4033.08] + - [815, 4033.08] - - [64, 2368, 1, 128] - - [642, 1165.88] + - [716, 1165.88] - - [176, 1500, 1, 1408] - - [669, 4922.13] + - [743, 4922.13] - - [448, 5888, 1, 1280] - - [751, 7550.21] + - [825, 7550.21] - - [512, 3000, 1, 2048] - - [773, 6562.44] + - [847, 6562.44] - - [5056, 448, 1, 128] - - [723, 3947.97] + - [797, 3947.97] - - [4288, 704, 1, 1280] - - [741, 8243.82] + - [815, 8243.82] - - [3584, 2944, 1, 128] - - [733, 4284.88] + - [807, 4284.88] - - [6784, 256, 1, 1280] - - [741, 7955.21] + - [815, 7955.21] - - [256, 2944, 1, 1280] - - [771, 6691.9] + - [845, 6691.9] - - [2560, 128, 1, 2560] - - [709, 5347.23] + - [783, 5347.23] - - [2368, 5888, 1, 3328] - - [746, 8919.07] + - [820, 8919.07] - - [4, 64, 1, 256] - - [694, 13.1032] + - [768, 13.1032] - - [704, 1024, 1, 3328] - - [771, 6648.12] + - [845, 6648.12] - - [2368, 1856, 1, 1280] - - [757, 8016.51] + - [831, 8016.51] - - [448, 5056, 1, 3328] - - [741, 8231.73] + - [815, 8231.73] - - [128, 448, 1, 128] - - [650, 441.208] + - [724, 441.208] - - [128, 6784, 1, 256] - - [751, 5850.05] + - [825, 5850.05] - - [512, 4, 1, 500000] - - [631, 1027.14] + - [705, 1027.14] - - [3584, 4288, 1, 128] - - [727, 4260.9] + - [801, 4260.9] - - [64, 448, 1, 128] - - [650, 253.554] + - [724, 253.554] - - [1024, 6000, 1, 2816] - - [755, 8886.14] + - [829, 8886.14] - - [5888, 4288, 1, 3328] - - [755, 8968.16] + - [829, 8968.16] - - [2368, 704, 1, 256] - - [771, 4663.24] + - [845, 4663.24] - - [256, 1856, 1, 3328] - - [743, 6480.63] + - [817, 6480.63] - - [1856, 128, 1, 256] - - [741, 3726.66] + - [815, 3726.66] - - [6784, 128, 1, 128] - - [644, 2824.01] + - [718, 2824.01] - - [3584, 1408, 1, 128] - - [727, 3666.78] + - [801, 3666.78] - - [1856, 5056, 1, 1280] - - [738, 8651.36] + - [812, 8651.36] - - [2944, 1024, 1, 1280] - - [749, 8765.21] + - [823, 8765.21] - - [5056, 4, 1, 256] - - [663, 428.688] + - [737, 428.688] - - [3584, 5888, 1, 3328] - - [749, 9347.75] + - [823, 9347.75] - - [2368, 4288, 1, 256] - - [759, 8013.1] + - [833, 8013.1] - - [1024, 2368, 1, 3328] - - [746, 8119.29] + - [820, 8119.29] - - [128, 3584, 1, 128] - - [646, 2584.62] + - [720, 2584.62] - - [704, 1408, 1, 256] - - [751, 6792.27] + - [825, 6792.27] - - [4096, 128, 1, 4096] - - [773, 6624.84] + - [847, 6624.84] - - [1024, 2944, 1, 128] - - [725, 3771.37] + - [799, 3771.37] - - [1024, 3584, 1, 1280] - - [746, 8952.71] + - [820, 8952.71] - - [4288, 5888, 1, 3328] - - [759, 9048.05] + - [833, 9048.05] - - [4288, 4, 1, 3328] - - [664, 615.206] + - [738, 615.206] - - [4608, 16, 1, 1536] - - [668, 2894.94] + - [742, 2894.94] - - [5888, 64, 1, 128] - - [655, 1827.16] + - [729, 1827.16] - - [4, 5888, 1, 128] - - [776, 179.544] + - [850, 179.544] - - [1024, 2944, 1, 3328] - - [747, 8298.77] + - [821, 8298.77] - - [2048, 64, 1, 2048] - - [676, 4963.77] + - [750, 4963.77] - - [6144, 2, 1, 2560] - - [665, 477.88] + - [739, 477.88] - - [256, 6784, 1, 1280] - - [739, 7491.94] + - [813, 7491.94] - - [1856, 3584, 1, 256] - - [751, 7580.6] + - [825, 7580.6] - - [128, 448, 1, 3328] - - [702, 4417.71] + - [776, 4417.71] - - [6784, 1856, 1, 128] - - [730, 4621.74] + - [804, 4621.74] - - [1024, 1500, 1, 2048] - - [751, 6284.5] + - [825, 6284.5] - - [5056, 128, 1, 256] - - [751, 5705.16] + - [825, 5705.16] - - [512, 24000, 1, 2816] - - [738, 8919.85] + - [812, 8919.85] - - [256, 5888, 1, 1280] - - [753, 7978.0] + - [827, 7978.0] - - [4, 128, 1, 1280] - - [694, 94.2609] + - [768, 94.2609] - - [4288, 6784, 1, 3328] - - [759, 9012.58] + - [833, 9012.58] - - [6784, 128, 1, 1280] - - [743, 6807.35] + - [817, 6807.35] - - [64, 1408, 1, 256] - - [672, 2045.19] + - [746, 2045.19] - - [2368, 1408, 1, 128] - - [723, 4340.73] + - [797, 4340.73] - - [1856, 448, 1, 256] - - [772, 3639.99] + - [846, 3639.99] - - [1408, 1024, 1, 128] - - [731, 3417.68] + - [805, 3417.68] - - [128, 64, 1, 128] - - [652, 68.7241] + - [726, 68.7241] - - [6784, 3584, 1, 3328] - - [749, 9425.63] + - [823, 9425.63] - - [1760, 7000, 1, 1760] - - [746, 8780.41] + - [820, 8780.41] - - [1024, 704, 1, 3328] - - [763, 5644.6] + - [837, 5644.6] - - [64, 64, 1, 128] - - [642, 38.2023] + - [716, 38.2023] - - [2368, 5056, 1, 1280] - - [760, 8462.41] + - [834, 8462.41] - - [64, 4, 1, 1280] - - [694, 46.6455] + - [768, 46.6455] - - [1408, 2368, 1, 1280] - - [746, 8235.08] + - [820, 8235.08] - - [128, 1408, 1, 1280] - - [708, 4491.66] + - [782, 4491.66] - - [1024, 1, 1, 512] - - [712, 82.02] + - [786, 82.02] - - [4, 1408, 1, 128] - - [776, 56.42] + - [850, 56.42] - - [704, 4288, 1, 128] - - [730, 3942.96] + - [804, 3942.96] - - [128, 1856, 1, 3328] - - [696, 6111.93] + - [770, 6111.93] - - [2944, 2944, 1, 256] - - [755, 8640.22] + - [829, 8640.22] - - [2944, 4, 1, 1280] - - [689, 554.265] + - [763, 554.265] - - [5888, 4, 1, 256] - - [671, 435.744] + - [745, 435.744] - - [6784, 256, 1, 256] - - [751, 7025.96] + - [825, 7025.96] - - [256, 5056, 1, 3328] - - [751, 8249.57] + - [825, 8249.57] - - [128, 4288, 1, 1280] - - [741, 5561.74] + - [815, 5561.74] - - [5056, 1856, 1, 128] - - [735, 3975.28] + - [809, 3975.28] - - [1024, 3000, 1, 1536] - - [756, 8544.54] + - [830, 8544.54] - - [5056, 1024, 1, 3328] - - [749, 9361.47] + - [823, 9361.47] - - [128, 128, 1, 256] - - [701, 699.151] + - [775, 699.151] - - [1760, 64, 1, 1760] - - [669, 4956.26] + - [743, 4956.26] - - [4288, 3584, 1, 3328] - - [769, 7506.18] + - [843, 7506.18] - - [448, 704, 1, 3328] - - [741, 4697.66] + - [815, 4697.66] - - [448, 448, 1, 128] - - [658, 1249.62] + - [732, 1249.62] - - [1024, 2368, 1, 1280] - - [751, 7756.44] + - [825, 7756.44] - - [1856, 704, 1, 3328] - - [751, 8340.66] + - [825, 8340.66] - - [512, 1500, 1, 2560] - - [753, 6041.39] + - [827, 6041.39] - - [5888, 6784, 1, 3328] - - [749, 9199.38] + - [823, 9199.38] - - [704, 4288, 1, 1280] - - [743, 8342.06] + - [817, 8342.06] - - [128, 50176, 1, 512] - - [789, 7589.48] + - [863, 7589.48] - - [704, 256, 1, 256] - - [741, 2912.81] + - [815, 2912.81] - - [1024, 48000, 1, 2048] - - [746, 8947.42] + - [820, 8947.42] - - [4288, 1024, 1, 128] - - [722, 4291.75] + - [796, 4291.75] - - [3136, 64, 128, 64] - - [804, 8175.16] - - - [784, 512, 64, 128] - - [802, 8378.44] - - - [3136, 256, 64, 64] - - [805, 8506.75] - - - [12544, 1024, 1, 256] - - [798, 8928.03] + - [878, 8175.16] - - [784, 128, 128, 512] - - [803, 8190.63] + - [877, 8190.63] - - [784, 512, 256, 128] - - [801, 8637.24] - - - [3136, 64, 64, 256] - - [800, 8783.03] - - - [3136, 512, 1, 2048] - - [797, 7298.42] - - - [12544, 256, 1, 1024] - - [809, 7667.35] - - - [3136, 2048, 1, 512] - - [808, 8447.32] + - [875, 8637.24] - - [3136, 256, 256, 64] - - [801, 8663.18] + - [875, 8663.18] - - [3136, 64, 128, 256] - - [799, 8943.56] - - - [784, 128, 64, 512] - - [807, 8006.37] + - [873, 8943.56] - - [3136, 64, 256, 64] - - [804, 8267.22] + - [878, 8267.22] - - [784, 512, 128, 128] - - [801, 8564.35] - - - [3136, 64, 64, 64] - - [804, 8009.45] + - [875, 8564.35] - - [784, 128, 256, 512] - - [805, 8377.16] + - [879, 8377.16] - - [3136, 64, 256, 256] - - [806, 9033.98] + - [880, 9033.98] - - [3136, 256, 128, 64] - - [801, 8624.56] + - [875, 8624.56] - - [1024, 256, 1, 1024] - - [827, 6331.13] + - [901, 6331.13] - - [1024, 512, 1, 2048] - - [826, 8100.14] + - [900, 8100.14] - - [512, 200, 1, 512] - - [835, 2861.93] + - [909, 2861.93] - - [4096, 256, 1, 2048] - - [818, 8812.82] + - [892, 8812.82] - - [4096, 512, 1, 1024] - - [828, 9068.87] + - [902, 9068.87] - - [1024, 200, 1, 1024] - - [827, 5110.12] + - [901, 5110.12] - - [1024, 512, 1, 1024] - - [820, 7785.35] + - [894, 7785.35] - - [2048, 256, 1, 4096] - - [830, 8438.81] + - [904, 8438.81] - - [2048, 768, 1, 512] - - [812, 8618.53] + - [886, 8618.53] - - [512, 256, 1, 1024] - - [832, 4835.03] + - [906, 4835.03] - - [512, 768, 1, 2048] - - [829, 6909.04] + - [903, 6909.04] - - [2048, 256, 1, 1024] - - [825, 7941.98] + - [899, 7941.98] - - [1024, 256, 1, 2048] - - [822, 6997.9] + - [896, 6997.9] - - [2048, 200, 1, 512] - - [825, 5649.76] + - [899, 5649.76] - - [4096, 200, 1, 1024] - - [823, 6678.93] + - [897, 6678.93] - - [2048, 200, 1, 4096] - - [831, 6706.69] + - [905, 6706.69] - - [2048, 512, 1, 1024] - - [828, 8549.0] + - [902, 8549.0] - - [1024, 1024, 1, 512] - - [823, 8046.73] + - [897, 8046.73] - - [1024, 200, 1, 4096] - - [822, 5884.36] + - [896, 5884.36] - - [2048, 512, 1, 4096] - - [833, 8995.94] + - [907, 8995.94] - - [4096, 512, 1, 2048] - - [828, 9298.18] + - [902, 9298.18] - - [4096, 1024, 1, 2048] - - [810, 9790.77] + - [884, 9790.77] - - [2048, 1024, 1, 2048] - - [811, 9278.9] + - [885, 9278.9] - - [1024, 200, 1, 512] - - [827, 4535.46] + - [901, 4535.46] - - [1024, 1024, 1, 4096] - - [818, 8967.39] + - [892, 8967.39] - - [2048, 1024, 1, 4096] - - [813, 9500.56] + - [887, 9500.56] - - [4096, 200, 1, 2048] - - [819, 7082.68] + - [893, 7082.68] - - [2048, 200, 1, 1024] - - [825, 6212.04] + - [899, 6212.04] - - [1024, 768, 1, 512] - - [826, 7401.81] + - [900, 7401.81] - - [2048, 512, 1, 512] - - [823, 8124.66] + - [897, 8124.66] - - [2048, 200, 1, 2048] - - [825, 6561.9] + - [899, 6561.9] - - [2048, 256, 1, 2048] - - [826, 8224.23] + - [900, 8224.23] - - [512, 768, 1, 512] - - [824, 6469.46] + - [898, 6469.46] - - [512, 200, 1, 1024] - - [827, 3755.74] + - [901, 3755.74] - - [4096, 1024, 1, 1024] - - [810, 9605.95] + - [884, 9605.95] - - [4096, 256, 1, 4096] - - [833, 8961.39] + - [907, 8961.39] - - [1024, 512, 1, 512] - - [826, 7109.09] + - [900, 7109.09] - - [512, 256, 1, 512] - - [834, 4033.08] + - [908, 4033.08] - - [1024, 256, 1, 4096] - - [822, 7326.4] - - - [4096, 512, 1, 4096] - - [814, 9472.07] + - [896, 7326.4] - - [1024, 200, 1, 2048] - - [815, 5530.56] + - [889, 5530.56] - - [2048, 1024, 1, 512] - - [816, 8995.93] + - [890, 8995.93] - - [1024, 1024, 1, 2048] - - [823, 8830.21] + - [897, 8830.21] - - [4096, 256, 1, 1024] - - [823, 8581.8] + - [897, 8581.8] - - [512, 768, 1, 1024] - - [824, 6876.01] + - [898, 6876.01] - - [1024, 512, 1, 4096] - - [820, 8484.15] + - [894, 8484.15] - - [1024, 256, 1, 512] - - [817, 5668.08] + - [891, 5668.08] - - [4096, 200, 1, 4096] - - [830, 7018.69] + - [904, 7018.69] - - [2048, 256, 1, 512] - - [830, 7079.09] + - [904, 7079.09] - - [512, 200, 1, 2048] - - [835, 4283.5] + - [909, 4283.5] - - [1024, 1024, 1, 1024] - - [818, 8565.37] + - [892, 8565.37] - - [2048, 512, 1, 2048] - - [818, 8850.59] + - [892, 8850.59] - - [4096, 1024, 1, 4096] - - [811, 9843.28] + - [885, 9843.28] - - [2048, 1024, 1, 1024] - - [816, 9234.21] + - [890, 9234.21] - - [4096, 384, 1, 2048] - - [858, 8892.62] + - [932, 8892.62] - - [4096, 192, 1, 2048] - - [852, 8024.28] + - [926, 8024.28] - - [289, 160, 64, 768] - - [854, 6783.73] + - [928, 6783.73] - - [1225, 192, 64, 384] - - [841, 9373.93] + - [915, 9373.93] - - [5329, 64, 64, 160] - - [845, 9186.79] + - [919, 9186.79] - - [1225, 64, 64, 288] - - [836, 8492.51] + - [910, 8492.51] - - [1225, 64, 64, 384] - - [840, 8735.86] + - [914, 8735.86] - - [289, 128, 64, 1024] - - [855, 7000.3] + - [929, 7000.3] - - [4096, 320, 1, 1280] - - [860, 8302.36] + - [934, 8302.36] - - [4096, 384, 1, 1536] - - [842, 9052.55] + - [916, 9052.55] - - [4096, 192, 1, 1280] - - [857, 7561.95] + - [931, 7561.95] - - [289, 192, 64, 768] - - [853, 7882.6] + - [927, 7882.6] - - [1225, 48, 64, 256] - - [844, 6620.35] + - [918, 6620.35] - - [289, 192, 64, 1024] - - [851, 7347.09] + - [925, 7347.09] - - [1225, 64, 64, 192] - - [837, 8098.45] + - [911, 8098.45] - - [1225, 96, 64, 384] - - [838, 8303.18] + - [912, 8303.18] - - [1225, 48, 64, 288] - - [846, 6746.87] + - [920, 6746.87] - - [4096, 320, 1, 2048] - - [847, 8384.52] + - [921, 8384.52] - - [4096, 256, 1, 1536] - - [859, 8734.44] + - [933, 8734.44] - - [1225, 48, 64, 192] - - [846, 6516.46] + - [920, 6516.46] - - [4096, 384, 1, 1280] - - [856, 9023.34] + - [930, 9023.34] - - [1225, 64, 64, 256] - - [843, 8319.44] + - [917, 8319.44] - - [4096, 448, 1, 1280] - - [847, 8343.42] + - [921, 8343.42] - - [289, 128, 64, 768] - - [849, 7668.08] + - [923, 7668.08] - - [289, 256, 64, 1024] - - [850, 7535.56] + - [924, 7535.56] - - [4096, 448, 1, 2048] - - [847, 8572.41] + - [921, 8572.41] - - [5329, 80, 64, 64] - - [846, 6492.54] + - [920, 6492.54] - - [1225, 32, 64, 192] - - [839, 6278.64] + - [913, 6278.64] - - [289, 384, 64, 1024] - - [848, 7767.67] + - [922, 7767.67] - - [1024, 3594, 1, 4096] - - [867, 8661.52] + - [941, 8661.52] - - [4096, 3103, 1, 1024] - - [877, 9652.23] + - [951, 9652.23] - - [4096, 3136, 1, 1024] - - [861, 9723.15] + - [935, 9723.15] - - [1024, 3141, 1, 4096] - - [879, 8612.12] + - [953, 8612.12] - - [64, 147, 432, 148] - - [894, 6372.03] + - [968, 6372.03] - - [4096, 3559, 1, 1024] - - [866, 9906.35] + - [940, 9906.35] - - [4096, 3368, 1, 1024] - - [861, 9721.01] + - [935, 9721.01] - - [1024, 3335, 1, 4096] - - [885, 8990.29] + - [959, 8990.29] - - [1024, 3510, 1, 4096] - - [885, 9440.68] + - [959, 9440.68] - - [4096, 3209, 1, 1024] - - [866, 9632.76] + - [940, 9632.76] - - [4096, 3322, 1, 1024] - - [865, 9939.52] + - [939, 9939.52] - - [1024, 3400, 1, 4096] - - [884, 9156.09] + - [958, 9156.09] - - [1024, 3995, 1, 4096] - - [867, 9610.25] + - [941, 9610.25] - - [1024, 3503, 1, 4096] - - [885, 9446.57] + - [959, 9446.57] - - [4096, 3594, 1, 1024] - - [876, 9691.96] + - [950, 9691.96] - - [4096, 3473, 1, 1024] - - [865, 9698.9] + - [939, 9698.9] - - [4096, 3522, 1, 1024] - - [866, 9816.92] + - [940, 9816.92] - - [1024, 3103, 1, 4096] - - [863, 8491.05] + - [937, 8491.05] - - [1024, 3214, 1, 4096] - - [884, 8667.67] + - [958, 8667.67] - - [4096, 3449, 1, 1024] - - [876, 9795.71] + - [950, 9795.71] - - [1024, 3136, 1, 4096] - - [885, 8500.61] + - [959, 8500.61] - - [1024, 3955, 1, 33708] - - [865, 9634.94] + - [939, 9634.94] - - [1024, 3780, 1, 4096] - - [868, 9088.88] + - [942, 9088.88] - - [1024, 3906, 1, 33708] - - [866, 9515.46] + - [940, 9515.46] - - [1024, 3386, 1, 4096] - - [885, 9116.05] + - [959, 9116.05] - - [4096, 3396, 1, 1024] - - [876, 9665.6] + - [950, 9665.6] - - [1024, 3183, 1, 4096] - - [863, 8662.94] + - [937, 8662.94] - - [1024, 3098, 1, 4096] - - [879, 8490.22] + - [953, 8490.22] - - [1024, 3548, 1, 4096] - - [885, 9555.63] + - [959, 9555.63] - - [1024, 3224, 1, 4096] - - [878, 8760.88] + - [952, 8760.88] - - [4096, 3469, 1, 1024] - - [865, 9687.21] + - [939, 9687.21] - - [1024, 3582, 1, 4096] - - [882, 9691.0] + - [956, 9691.0] - - [1024, 2977, 1, 4096] - - [867, 9379.38] + - [941, 9379.38] - - [1024, 3939, 1, 1024] - - [864, 9172.11] + - [938, 9172.11] - - [64, 123, 528, 123] - - [912, 6346.17] + - [986, 6346.17] - - [64, 12, 5040, 12] - - [889, 1536.1] + - [963, 1536.1] - - [4096, 3176, 1, 1024] - - [877, 9712.2] + - [951, 9712.2] - - [1024, 3559, 1, 4096] - - [881, 9579.84] + - [955, 9579.84] - - [1024, 3478, 1, 4096] - - [885, 9373.85] + - [959, 9373.85] - - [4096, 3343, 1, 1024] - - [861, 9638.77] + - [935, 9638.77] - - [4096, 3440, 1, 1024] - - [861, 9853.96] + - [935, 9853.96] - - [1024, 3996, 1, 33708] - - [865, 9733.55] + - [939, 9733.55] - - [1024, 4012, 1, 4096] - - [866, 9636.99] + - [940, 9636.99] - - [1024, 3322, 1, 4096] - - [885, 8945.12] + - [959, 8945.12] - - [1024, 3990, 1, 33708] - - [865, 9720.31] + - [939, 9720.31] - - [1024, 3314, 1, 4096] - - [885, 8944.72] + - [959, 8944.72] - - [4096, 3513, 1, 1024] - - [865, 9794.95] + - [939, 9794.95] - - [1024, 3562, 1, 4096] - - [885, 9597.28] + - [959, 9597.28] - - [1024, 3443, 1, 4096] - - [885, 9279.52] + - [959, 9279.52] - - [1024, 3554, 1, 4096] - - [882, 9552.16] + - [956, 9552.16] - - [1024, 3063, 1, 4096] - - [867, 9622.58] + - [941, 9622.58] - - [64, 111, 576, 112] - - [912, 6274.65] + - [986, 6274.65] - - [4096, 3460, 1, 1024] - - [865, 9665.69] + - [939, 9665.69] - - [1024, 3209, 1, 4096] - - [864, 8708.39] + - [938, 8708.39] - - [1024, 3147, 1, 4096] - - [885, 8492.23] + - [959, 8492.23] - - [4096, 3387, 1, 1024] - - [862, 9761.34] + - [936, 9761.34] - - [4096, 3436, 1, 1024] - - [861, 9815.15] + - [935, 9815.15] - - [1024, 3341, 1, 4096] - - [884, 9005.07] + - [958, 9005.07] - - [1024, 3516, 1, 4096] - - [884, 9471.39] + - [958, 9471.39] - - [4096, 3277, 1, 1024] - - [865, 9807.12] + - [939, 9807.12] - - [1024, 3454, 1, 4096] - - [885, 9301.03] + - [959, 9301.03] - - [1024, 3969, 1, 4096] - - [865, 9539.82] + - [939, 9539.82] - - [1024, 3999, 1, 4096] - - [866, 9607.52] + - [940, 9607.52] - - [1024, 4032, 1, 4096] - - [867, 9693.47] + - [941, 9693.47] - - [4096, 3541, 1, 1024] - - [866, 9866.73] + - [940, 9866.73] - - [4096, 3334, 1, 1024] - - [877, 9614.41] + - [951, 9614.41] - - [1024, 3365, 1, 4096] - - [885, 9058.58] + - [959, 9058.58] - - [1024, 3527, 1, 4096] - - [885, 9510.31] + - [959, 9510.31] - - [1024, 3190, 1, 4096] - - [884, 8627.8] + - [958, 8627.8] - - [4096, 3906, 1, 1024] - - [862, 9817.78] + - [936, 9817.78] - - [1024, 3593, 1, 4096] - - [867, 8663.09] + - [941, 8663.09] - - [1024, 3336, 1, 4096] - - [885, 8991.13] + - [959, 8991.13] - - [4096, 3504, 1, 1024] - - [865, 9769.86] + - [939, 9769.86] - - [4096, 3977, 1, 1024] - - [866, 9742.62] + - [940, 9742.62] - - [1024, 3906, 1, 4096] - - [866, 9386.25] + - [940, 9386.25] - - [4096, 3415, 1, 1024] - - [876, 9802.7] + - [950, 9802.7] - - [1024, 3295, 1, 4096] - - [884, 8879.26] + - [958, 8879.26] - - [4096, 3321, 1, 1024] - - [866, 9931.43] + - [940, 9931.43] - - [1024, 3072, 1, 4096] - - [867, 9671.71] + - [941, 9671.71] - - [1024, 3408, 1, 4096] - - [884, 9182.83] + - [958, 9182.83] - - [1024, 3522, 1, 4096] - - [885, 9484.63] + - [959, 9484.63] - - [4096, 3751, 1, 1024] - - [866, 9778.86] + - [940, 9778.86] - - [4096, 3378, 1, 1024] - - [876, 9692.77] + - [950, 9692.77] - - [64, 77, 816, 77] - - [918, 4850.29] + - [992, 4850.29] - - [1024, 3925, 1, 33708] - - [865, 9560.88] + - [939, 9560.88] - - [1024, 3990, 1, 1024] - - [867, 9272.75] + - [941, 9272.75] - - [1024, 3290, 1, 4096] - - [878, 8905.61] + - [952, 8905.61] - - [4096, 3500, 1, 1024] - - [866, 9761.82] + - [940, 9761.82] - - [4096, 3565, 1, 1024] - - [865, 9919.37] + - [939, 9919.37] - - [1024, 3484, 1, 4096] - - [884, 9376.52] + - [958, 9376.52] - - [4096, 3395, 1, 1024] - - [877, 9788.16] + - [951, 9788.16] - - [64, 92, 688, 92] - - [904, 5606.1] + - [978, 5606.1] - - [1024, 3681, 1, 1024] - - [869, 8690.23] + - [943, 8690.23] - - [64, 159, 400, 159] - - [896, 6518.97] + - [970, 6518.97] - - [1024, 3584, 1, 1024] - - [884, 9365.37] + - [958, 9365.37] - - [4096, 3093, 1, 1024] - - [876, 9623.41] + - [950, 9623.41] - - [1024, 4050, 1, 1024] - - [868, 9354.14] + - [942, 9354.14] - - [1024, 3301, 1, 4096] - - [885, 8889.04] + - [959, 8889.04] - - [1024, 3581, 1, 4096] - - [884, 9673.82] + - [958, 9673.82] - - [4096, 3374, 1, 1024] - - [877, 9707.33] + - [951, 9707.33] - - [1024, 3449, 1, 4096] - - [885, 9270.9] + - [959, 9270.9] - - [4096, 3215, 1, 1024] - - [866, 9645.25] + - [940, 9645.25] - - [4096, 3312, 1, 1024] - - [866, 9888.72] + - [940, 9888.72] - - [4096, 3479, 1, 1024] - - [866, 9698.61] + - [940, 9698.61] - - [4096, 3544, 1, 1024] - - [866, 9875.09] + - [940, 9875.09] - - [1024, 3263, 1, 4096] - - [885, 8787.61] + - [959, 8787.61] - - [4096, 3455, 1, 1024] - - [876, 9845.29] + - [950, 9845.29] - - [1024, 3379, 1, 4096] - - [882, 9100.01] + - [956, 9100.01] - - [1024, 3490, 1, 4096] - - [885, 9397.49] + - [959, 9397.49] - - [1024, 3368, 1, 4096] - - [885, 9079.25] + - [959, 9079.25] - - [4096, 3186, 1, 1024] - - [861, 9750.17] + - [935, 9750.17] - - [1024, 3428, 1, 4096] - - [885, 9232.92] + - [959, 9232.92] - - [64, 85, 752, 84] - - [900, 5342.67] + - [974, 5342.67] - - [4096, 3561, 1, 1024] - - [866, 9914.02] + - [940, 9914.02] - - [4096, 3418, 1, 1024] - - [876, 9765.86] + - [950, 9765.86] - - [1024, 3064, 1, 4096] - - [867, 9621.68] + - [941, 9621.68] - - [4096, 3259, 1, 1024] - - [866, 9765.52] + - [940, 9765.52] - - [4096, 3308, 1, 1024] - - [865, 9900.46] + - [939, 9900.46] - - [1024, 3533, 1, 4096] - - [885, 9520.12] + - [959, 9520.12] - - [1024, 3344, 1, 4096] - - [885, 9014.55] + - [959, 9014.55] - - [1024, 4030, 1, 1024] - - [867, 9354.1] + - [941, 9354.1] - - [4096, 3459, 1, 1024] - - [866, 9656.2] + - [940, 9656.2] - - [1024, 3572, 1, 4096] - - [882, 9640.07] + - [956, 9640.07] - - [1024, 3925, 1, 1024] - - [878, 9173.74] + - [952, 9173.74] - - [4096, 3435, 1, 1024] - - [861, 9778.2] + - [935, 9778.2] - - [1024, 3956, 1, 4096] - - [868, 9498.56] + - [942, 9498.56] - - [1024, 3463, 1, 4096] - - [885, 9332.46] + - [959, 9332.46] - - [4096, 3182, 1, 1024] - - [876, 9826.84] + - [950, 9826.84] - - [4096, 3976, 1, 1024] - - [876, 9741.99] + - [950, 9741.99] - - [1024, 3417, 1, 4096] - - [885, 9208.97] + - [959, 9208.97] - - [1024, 3528, 1, 4096] - - [885, 9509.09] + - [959, 9509.09] - - [4096, 3446, 1, 1024] - - [876, 9816.97] + - [950, 9816.97] - - [64, 122, 528, 123] - - [912, 6325.98] + - [986, 6325.98] - - [1024, 3543, 1, 4096] - - [885, 9538.73] + - [959, 9538.73] - - [4096, 3287, 1, 1024] - - [865, 9846.04] + - [939, 9846.04] - - [1024, 3499, 1, 4096] - - [885, 9428.51] + - [959, 9428.51] - - [1024, 3231, 1, 4096] - - [878, 8769.91] + - [952, 8769.91] - - [64, 17, 3632, 17] - - [900, 1934.94] + - [974, 1934.94] - - [4096, 3519, 1, 1024] - - [865, 9804.38] + - [939, 9804.38] - - [4096, 3552, 1, 1024] - - [865, 9892.65] + - [939, 9892.65] - - [1024, 3458, 1, 4096] - - [885, 9312.28] + - [959, 9312.28] - - [64, 93, 688, 92] - - [904, 5660.22] + - [978, 5660.22] - - [1024, 3374, 1, 4096] - - [879, 9110.41] + - [953, 9110.41] - - [1024, 3396, 1, 4096] - - [885, 9145.79] + - [959, 9145.79] - - [1024, 2967, 1, 4096] - - [867, 9364.76] + - [941, 9364.76] - - [64, 19, 3264, 19] - - [904, 2142.47] + - [978, 2142.47] - - [4096, 3482, 1, 1024] - - [865, 9714.2] + - [939, 9714.2] - - [64, 32, 1984, 32] - - [915, 3619.91] + - [989, 3619.91] - - [64, 102, 624, 99] - - [906, 5515.33] + - [980, 5515.33] - - [1024, 3226, 1, 4096] - - [864, 8790.47] + - [938, 8790.47] - - [4096, 3377, 1, 1024] - - [862, 9684.08] + - [936, 9684.08] - - [4096, 3426, 1, 1024] - - [877, 9869.94] + - [951, 9869.94] - - [4096, 2935, 1, 1024] - - [877, 9762.11] + - [951, 9762.11] - - [64, 133, 480, 133] - - [916, 5891.32] + - [990, 5891.32] - - [1024, 3439, 1, 4096] - - [885, 9253.99] + - [959, 9253.99] - - [4096, 3267, 1, 1024] - - [865, 9783.9] + - [939, 9783.9] - - [4096, 3499, 1, 1024] - - [866, 9761.11] + - [940, 9761.11] - - [4096, 3356, 1, 1024] - - [877, 9679.44] + - [951, 9679.44] - - [64, 232, 272, 232] - - [920, 7181.03] + - [994, 7181.03] - - [64, 162, 400, 159] - - [880, 6444.63] + - [954, 6444.63] - - [4096, 3939, 1, 1024] - - [876, 9878.0] + - [950, 9878.0] - - [1024, 3526, 1, 4096] - - [885, 9508.1] + - [959, 9508.1] - - [1024, 3859, 1, 33708] - - [866, 9402.13] + - [940, 9402.13] - - [1024, 3385, 1, 4096] - - [884, 9107.28] + - [958, 9107.28] - - [1024, 3496, 1, 4096] - - [885, 9418.0] + - [959, 9418.0] - - [4096, 3141, 1, 1024] - - [877, 9682.54] + - [951, 9682.54] - - [4096, 3510, 1, 1024] - - [865, 9786.59] + - [939, 9786.59] - - [1024, 3434, 1, 4096] - - [885, 9246.7] + - [959, 9246.7] - - [4096, 3969, 1, 1024] - - [865, 9714.85] + - [939, 9714.85] - - [1024, 3121, 1, 4096] - - [863, 8464.32] + - [937, 8464.32] - - [1024, 3232, 1, 4096] - - [885, 8711.73] + - [959, 8711.73] - - [1024, 4030, 1, 33708] - - [866, 9816.31] + - [940, 9816.31] - - [1024, 3780, 1, 33708] - - [874, 9315.54] + - [948, 9315.54] - - [1024, 3969, 1, 1024] - - [863, 9248.54] + - [937, 9248.54] - - [4096, 3527, 1, 1024] - - [865, 9832.94] + - [939, 9832.94] - - [4096, 3336, 1, 1024] - - [862, 9623.35] + - [936, 9623.35] - - [4096, 3290, 1, 1024] - - [865, 9852.21] + - [939, 9852.21] - - [64, 9, 6544, 9] - - [905, 1068.24] + - [979, 1068.24] - - [1024, 3469, 1, 4096] - - [885, 9350.55] + - [959, 9350.55] - - [4096, 3490, 1, 1024] - - [865, 9737.56] + - [939, 9737.56] - - [4096, 3064, 1, 1024] - - [865, 9890.02] + - [939, 9890.02] - - [4096, 3582, 1, 1024] - - [866, 9961.38] + - [940, 9961.38] - - [1024, 3956, 1, 1024] - - [863, 9294.25] + - [937, 9294.25] - - [4096, 3417, 1, 1024] - - [861, 9811.66] + - [935, 9811.66] - - [1024, 2736, 1, 4096] - - [867, 8636.7] + - [941, 8636.7] - - [64, 78, 816, 78] - - [904, 4946.1] + - [978, 4946.1] - - [1024, 3205, 1, 4096] - - [879, 8657.21] + - [953, 8657.21] - - [1024, 3143, 1, 4096] - - [879, 8567.87] + - [953, 8567.87] - - [1024, 4020, 1, 4096] - - [867, 9664.62] + - [941, 9664.62] - - [1024, 3318, 1, 4096] - - [864, 8967.05] + - [938, 8967.05] - - [4096, 3364, 1, 1024] - - [877, 9697.18] + - [951, 9697.18] - - [1024, 3353, 1, 4096] - - [885, 9034.17] + - [959, 9034.17] - - [1024, 3464, 1, 4096] - - [885, 9326.05] + - [959, 9326.05] - - [4096, 3205, 1, 1024] - - [865, 9619.1] + - [939, 9619.1] - - [4096, 3318, 1, 1024] - - [866, 9932.66] + - [940, 9932.66] - - [1024, 3402, 1, 4096] - - [884, 9153.49] + - [958, 9153.49] - - [4096, 3181, 1, 1024] - - [876, 9789.15] + - [950, 9789.15] - - [4096, 3550, 1, 1024] - - [866, 9888.13] + - [940, 9888.13] - - [4096, 3445, 1, 1024] - - [876, 9752.65] + - [950, 9752.65] - - [1024, 3138, 1, 4096] - - [862, 8484.1] + - [936, 8484.1] - - [64, 99, 624, 99] - - [912, 5323.99] + - [986, 5323.99] - - [4096, 3079, 1, 1024] - - [862, 9562.26] + - [936, 9562.26] - - [4096, 3144, 1, 1024] - - [876, 9686.66] + - [950, 9686.66] - - [4096, 3860, 1, 1024] - - [877, 9733.42] + - [951, 9733.42] - - [1024, 3515, 1, 4096] - - [885, 9478.44] + - [959, 9478.44] - - [4096, 3408, 1, 1024] - - [862, 9764.96] + - [936, 9764.96] - - [64, 101, 624, 102] - - [912, 5482.79] + - [986, 5482.79] - - [1024, 3181, 1, 4096] - - [864, 8593.26] + - [938, 8593.26] - - [4096, 3298, 1, 1024] - - [866, 9867.72] + - [940, 9867.72] - - [4096, 3585, 1, 1024] - - [876, 9633.01] + - [950, 9633.01] - - [1024, 3550, 1, 4096] - - [885, 9564.46] + - [959, 9564.46] - - [1024, 4020, 1, 1024] - - [868, 9339.15] + - [942, 9339.15] - - [4096, 3481, 1, 1024] - - [866, 9714.0] + - [940, 9714.0] - - [4096, 3530, 1, 1024] - - [866, 9833.99] + - [940, 9833.99] - - [4096, 3425, 1, 1024] - - [862, 9675.66] + - [936, 9675.66] - - [4096, 4026, 1, 1024] - - [866, 9849.77] + - [940, 9849.77] - - [1024, 3860, 1, 1024] - - [879, 9073.59] + - [953, 9073.59] - - [4096, 3975, 1, 1024] - - [866, 9737.72] + - [940, 9737.72] - - [1024, 3286, 1, 4096] - - [863, 8884.24] + - [937, 8884.24] - - [1024, 3176, 1, 4096] - - [863, 8597.48] + - [937, 8597.48] - - [1024, 3894, 1, 4096] - - [867, 9359.13] + - [941, 9359.13] - - [4096, 3355, 1, 1024] - - [876, 9693.09] + - [950, 9693.09] - - [4096, 3404, 1, 1024] - - [876, 9786.12] + - [950, 9786.12] - - [1024, 3501, 1, 4096] - - [884, 9426.14] + - [958, 9426.14] - - [4096, 3245, 1, 1024] - - [866, 9723.57] + - [940, 9723.57] - - [1024, 3431, 1, 4096] - - [882, 9244.32] + - [956, 9244.32] - - [1024, 4000, 1, 1024] - - [878, 9344.03] + - [952, 9344.03] - - [4096, 3509, 1, 1024] - - [865, 9781.72] + - [939, 9781.72] - - [4096, 3558, 1, 1024] - - [866, 9905.15] + - [940, 9905.15] - - [1024, 3535, 1, 4096] - - [884, 9519.15] + - [958, 9519.15] - - [1024, 3414, 1, 4096] - - [882, 9198.05] + - [956, 9198.05] - - [1024, 3445, 1, 4096] - - [885, 9279.66] + - [959, 9279.66] - - [1024, 3436, 1, 4096] - - [885, 9259.7] + - [959, 9259.7] - - [4096, 3472, 1, 1024] - - [866, 9685.27] + - [940, 9685.27] - - [1024, 3211, 1, 4096] - - [864, 8708.41] + - [938, 8708.41] - - [64, 7, 8192, 7] - - [901, 802.916] + - [975, 802.916] - - [4096, 3383, 1, 1024] - - [876, 9734.82] + - [950, 9734.82] - - [4096, 3448, 1, 1024] - - [877, 9828.54] + - [951, 9828.54] - - [1024, 3343, 1, 4096] - - [878, 9010.46] + - [952, 9010.46] - - [1024, 3518, 1, 4096] - - [885, 9468.02] + - [959, 9468.02] - - [4096, 3289, 1, 1024] - - [866, 9844.16] + - [940, 9844.16] - - [1024, 3440, 1, 4096] - - [881, 9269.52] + - [955, 9269.52] - - [1024, 4032, 1, 33708] - - [865, 9822.41] + - [939, 9822.41] - - [4096, 3489, 1, 1024] - - [865, 9742.03] + - [939, 9742.03] - - [4096, 3346, 1, 1024] - - [862, 9616.74] + - [936, 9616.74] - - [1024, 3534, 1, 4096] - - [884, 9524.29] + - [958, 9524.29] - - [1024, 3079, 1, 4096] - - [879, 8397.77] + - [953, 8397.77] - - [1024, 3955, 1, 4096] - - [866, 9492.25] + - [940, 9492.25] - - [4096, 3236, 1, 1024] - - [866, 9706.03] + - [940, 9706.03] - - [1024, 3545, 1, 4096] - - [884, 9551.97] + - [958, 9551.97] - - [1024, 3144, 1, 4096] - - [878, 8556.8] + - [952, 8556.8] - - [4096, 3780, 1, 1024] - - [865, 9847.6] + - [939, 9847.6] - - [4096, 3163, 1, 1024] - - [876, 9717.79] + - [950, 9717.79] - - [4096, 3468, 1, 1024] - - [866, 9686.49] + - [940, 9686.49] - - [1024, 3539, 1, 4096] - - [885, 9526.99] + - [959, 9526.99] - - [1024, 3541, 1, 4096] - - [885, 9532.86] + - [959, 9532.86] - - [4096, 3363, 1, 1024] - - [861, 9699.1] + - [935, 9699.1] - - [1024, 3475, 1, 4096] - - [885, 9357.1] + - [959, 9357.1] - - [4096, 3110, 1, 1024] - - [877, 9659.68] + - [951, 9659.68] - - [1024, 3509, 1, 4096] - - [884, 9450.59] + - [958, 9450.59] - - [1024, 3413, 1, 4096] - - [885, 9185.91] + - [959, 9185.91] - - [1024, 3975, 1, 1024] - - [863, 9315.52] + - [937, 9315.52] - - [4096, 3549, 1, 1024] - - [866, 9884.82] + - [940, 9884.82] - - [4096, 3342, 1, 1024] - - [876, 9644.37] + - [950, 9644.37] - - [1024, 2985, 1, 4096] - - [866, 9392.17] + - [940, 9392.17] - - [1024, 3876, 1, 33708] - - [865, 9442.32] + - [939, 9442.32] - - [4096, 3280, 1, 1024] - - [865, 9820.02] + - [939, 9820.02] - - [4096, 3191, 1, 1024] - - [877, 9862.18] + - [951, 9862.18] - - [4096, 3512, 1, 1024] - - [866, 9793.21] + - [940, 9793.21] - - [1024, 3560, 1, 4096] - - [882, 9555.55] + - [956, 9555.55] - - [4096, 2499, 1, 1024] - - [866, 9669.45] + - [940, 9669.45] - - [1024, 3248, 1, 4096] - - [863, 8811.94] + - [937, 8811.94] - - [4096, 3423, 1, 1024] - - [877, 9729.77] + - [951, 9729.77] - - [64, 111, 576, 111] - - [912, 5982.73] + - [986, 5982.73] - - [4096, 3297, 1, 1024] - - [865, 9865.29] + - [939, 9865.29] - - [4096, 3154, 1, 1024] - - [877, 9613.52] + - [951, 9613.52] - - [1024, 3303, 1, 4096] - - [864, 8951.89] + - [938, 8951.89] - - [1024, 3222, 1, 4096] - - [884, 8682.99] + - [958, 8682.99] - - [1024, 3978, 1, 1024] - - [868, 9235.03] + - [942, 9235.03] - - [4096, 3529, 1, 1024] - - [866, 9831.72] + - [940, 9831.72] - - [4096, 3386, 1, 1024] - - [876, 9755.77] + - [950, 9755.77] - - [64, 134, 480, 134] - - [891, 5990.63] + - [965, 5990.63] - - [1024, 3451, 1, 4096] - - [882, 9277.71] + - [956, 9277.71] - - [4096, 3562, 1, 1024] - - [866, 9908.92] + - [940, 9908.92] - - [4096, 3276, 1, 1024] - - [865, 9818.14] + - [939, 9818.14] - - [64, 135, 480, 132] - - [920, 6071.87] + - [994, 6071.87] - - [1024, 3894, 1, 33708] - - [865, 9487.89] + - [939, 9487.89] - - [64, 134, 480, 132] - - [919, 6091.75] + - [993, 6091.75] - - [4096, 3540, 1, 1024] - - [866, 9862.89] + - [940, 9862.89] - - [1024, 3416, 1, 4096] - - [884, 9206.27] + - [958, 9206.27] - - [1024, 4005, 1, 33708] - - [865, 9757.29] + - [939, 9757.29] - - [1024, 3942, 1, 4096] - - [868, 9455.85] + - [942, 9455.85] - - [4096, 3403, 1, 1024] - - [876, 9739.46] + - [950, 9739.46] - - [4096, 3381, 1, 1024] - - [877, 9760.14] + - [951, 9760.14] - - [1024, 3492, 1, 4096] - - [881, 9391.79] + - [955, 9391.79] - - [4096, 3101, 1, 1024] - - [877, 9626.02] + - [951, 9626.02] - - [1024, 3430, 1, 4096] - - [885, 9232.14] + - [959, 9232.14] - - [1024, 3977, 1, 4096] - - [868, 9563.0] + - [942, 9563.0] - - [1024, 3640, 1, 4096] - - [867, 8761.5] + - [941, 8761.5] - - [4096, 3557, 1, 1024] - - [866, 9905.52] + - [940, 9905.52] - - [4096, 3414, 1, 1024] - - [862, 9755.49] + - [936, 9755.49] - - [1024, 3391, 1, 4096] - - [885, 9142.66] + - [959, 9142.66] - - [64, 134, 480, 135] - - [894, 5922.15] + - [968, 5922.15] - - [64, 16, 3840, 16] - - [910, 2080.61] + - [984, 2080.61] - - [1024, 3356, 1, 4096] - - [885, 9051.09] + - [959, 9051.09] - - [4096, 3320, 1, 1024] - - [866, 9929.57] + - [940, 9929.57] - - [4096, 2765, 1, 1024] - - [866, 9750.28] + - [940, 9750.28] - - [64, 162, 400, 162] - - [883, 6515.29] + - [957, 6515.29] - - [1024, 3411, 1, 4096] - - [885, 9185.72] + - [959, 9185.72] - - [1024, 3978, 1, 4096] - - [865, 9562.77] + - [939, 9562.77] - - [4096, 3487, 1, 1024] - - [866, 9733.85] + - [940, 9733.85] - - [4096, 3520, 1, 1024] - - [865, 9813.95] + - [939, 9813.95] - - [4096, 3942, 1, 1024] - - [876, 9804.39] + - [950, 9804.39] - - [4096, 3431, 1, 1024] - - [861, 9819.06] + - [935, 9819.06] - - [1024, 3271, 1, 4096] - - [878, 8913.08] + - [952, 8913.08] - - [4096, 4020, 1, 1024] - - [865, 9831.42] + - [939, 9831.42] - - [1024, 3481, 1, 4096] - - [881, 9376.15] + - [955, 9376.15] - - [1024, 3419, 1, 4096] - - [884, 9208.68] + - [958, 9208.68] - - [1024, 4059, 1, 4096] - - [868, 9733.83] + - [942, 9733.83] - - [4096, 3345, 1, 1024] - - [877, 9651.43] + - [951, 9651.43] - - [4096, 3394, 1, 1024] - - [877, 9780.43] + - [951, 9780.43] - - [1024, 3298, 1, 4096] - - [884, 8889.63] + - [958, 8889.63] - - [4096, 3235, 1, 1024] - - [866, 9705.81] + - [940, 9705.81] - - [1024, 3681, 1, 33708] - - [873, 9146.22] + - [947, 9146.22] - - [1024, 3840, 1, 4096] - - [866, 9253.95] + - [940, 9253.95] - - [1024, 3362, 1, 4096] - - [885, 9059.81] + - [959, 9059.81] - - [4096, 3467, 1, 1024] - - [865, 9677.51] + - [939, 9677.51] - - [1024, 3349, 1, 4096] - - [885, 9034.07] + - [959, 9034.07] - - [1024, 3460, 1, 4096] - - [885, 9322.94] + - [959, 9322.94] - - [4096, 3214, 1, 1024] - - [866, 9644.46] + - [940, 9644.46] - - [1024, 3398, 1, 4096] - - [885, 9157.29] + - [959, 9157.29] - - [4096, 3478, 1, 1024] - - [865, 9706.66] + - [939, 9706.66] - - [1024, 4050, 1, 33708] - - [865, 9865.14] + - [939, 9865.14] - - [1024, 3244, 1, 4096] - - [881, 8744.53] + - [955, 8744.53] - - [4096, 3341, 1, 1024] - - [877, 9646.79] + - [951, 9646.79] - - [4096, 3454, 1, 1024] - - [862, 9880.56] + - [936, 9880.56] - - [1024, 3166, 1, 4096] - - [879, 8618.46] + - [953, 8618.46] - - [1024, 3425, 1, 4096] - - [885, 9225.32] + - [959, 9225.32] - - [4096, 3295, 1, 1024] - - [866, 9863.81] + - [940, 9863.81] - - [4096, 3072, 1, 1024] - - [865, 9971.09] + - [939, 9971.09] - - [4096, 3822, 1, 1024] - - [866, 9952.07] + - [940, 9952.07] - - [1024, 3681, 1, 4096] - - [867, 8856.94] + - [941, 8856.94] - - [1024, 4050, 1, 4096] - - [867, 9717.58] + - [941, 9717.58] - - [4096, 3495, 1, 1024] - - [865, 9741.14] + - [939, 9741.14] - - [4096, 3560, 1, 1024] - - [866, 9909.14] + - [940, 9909.14] - - [1024, 3524, 1, 4096] - - [884, 9503.2] + - [958, 9503.2] - - [1024, 3942, 1, 33708] - - [865, 9602.67] + - [939, 9602.67] - - [1024, 3304, 1, 4096] - - [864, 8928.76] + - [938, 8928.76] - - [1024, 3387, 1, 4096] - - [885, 9127.65] + - [959, 9127.65] - - [1024, 3498, 1, 4096] - - [884, 9423.39] + - [958, 9423.39] - - [4096, 3458, 1, 1024] - - [865, 9642.63] + - [939, 9642.63] - - [4096, 2967, 1, 1024] - - [865, 9626.71] + - [939, 9626.71] - - [64, 8, 7280, 8] - - [887, 1032.61] + - [961, 1032.61] - - [4096, 3385, 1, 1024] - - [861, 9735.77] + - [935, 9735.77] - - [4096, 3434, 1, 1024] - - [876, 9808.9] + - [950, 9808.9] - - [1024, 3519, 1, 4096] - - [885, 9484.83] + - [959, 9484.83] - - [1024, 3511, 1, 4096] - - [885, 9456.47] + - [959, 9456.47] - - [1024, 3288, 1, 4096] - - [884, 8864.05] + - [958, 8864.05] - - [1024, 2918, 1, 4096] - - [867, 9170.35] + - [941, 9170.35] - - [4096, 3573, 1, 1024] - - [866, 9945.85] + - [940, 9945.85] - - [1024, 3822, 1, 33708] - - [875, 9331.0] + - [949, 9331.0] - - [64, 102, 624, 102] - - [912, 5531.17] + - [986, 5531.17] - - [4096, 3539, 1, 1024] - - [866, 9855.39] + - [940, 9855.39] - - [4096, 3332, 1, 1024] - - [877, 9648.97] + - [951, 9648.97] - - [4096, 3286, 1, 1024] - - [866, 9846.42] + - [940, 9846.42] - - [1024, 4026, 1, 4096] - - [867, 9675.94] + - [941, 9675.94] - - [1024, 3277, 1, 4096] - - [881, 8836.21] + - [955, 8836.21] - - [1024, 3471, 1, 4096] - - [885, 9346.33] + - [959, 9346.33] - - [4096, 3518, 1, 1024] - - [866, 9804.2] + - [940, 9804.2] - - [1024, 3393, 1, 4096] - - [885, 9148.99] + - [959, 9148.99] - - [4096, 3413, 1, 1024] - - [862, 9785.17] + - [936, 9785.17] - - [4096, 3303, 1, 1024] - - [866, 9884.37] + - [940, 9884.37] - - [1024, 3207, 1, 4096] - - [863, 8714.69] + - [937, 8714.69] - - [1024, 3894, 1, 1024] - - [879, 9181.51] + - [953, 9181.51] - - [1024, 3977, 1, 1024] - - [879, 9240.9] + - [953, 9240.9] - - [64, 135, 480, 133] - - [894, 5923.4] + - [968, 5923.4] - - [4096, 3535, 1, 1024] - - [866, 9839.55] + - [940, 9839.55] - - [4096, 3376, 1, 1024] - - [861, 9712.02] + - [935, 9712.02] - - [1024, 3355, 1, 4096] - - [885, 9043.27] + - [959, 9043.27] - - [64, 27, 2336, 27] - - [913, 2929.9] + - [987, 2929.9] - - [1024, 3466, 1, 4096] - - [885, 9339.1] + - [959, 9339.1] - - [4096, 3266, 1, 1024] - - [866, 9789.29] + - [940, 9789.29] - - [1024, 3404, 1, 4096] - - [885, 9176.76] + - [959, 9176.76] - - [1024, 3999, 1, 1024] - - [878, 9391.91] + - [952, 9391.91] - - [64, 148, 432, 143] - - [891, 6182.92] + - [965, 6182.92] - - [4096, 3498, 1, 1024] - - [865, 9764.56] + - [939, 9764.56] - - [1024, 4032, 1, 1024] - - [863, 9402.03] + - [937, 9402.03] - - [1024, 3410, 1, 4096] - - [884, 9183.5] + - [958, 9183.5] - - [4096, 3393, 1, 1024] - - [877, 9695.49] + - [951, 9695.49] - - [1024, 3140, 1, 4096] - - [878, 8504.86] + - [952, 8504.86] - - [1024, 3910, 1, 33708] - - [865, 9526.06] + - [939, 9526.06] - - [1024, 3334, 1, 4096] - - [884, 8987.59] + - [958, 8987.59] - - [4096, 3140, 1, 1024] - - [877, 9660.71] + - [951, 9660.71] - - [1024, 4005, 1, 4096] - - [868, 9629.88] + - [942, 9629.88] - - [1024, 3579, 1, 4096] - - [884, 9661.45] + - [958, 9661.45] - - [4096, 3372, 1, 1024] - - [877, 9697.32] + - [951, 9697.32] - - [1024, 3245, 1, 4096] - - [878, 8847.76] + - [952, 8847.76] - - [64, 38, 1680, 38] - - [888, 3340.44] + - [962, 3340.44] - - [4096, 3956, 1, 1024] - - [877, 9911.15] + - [951, 9911.15] - - [4096, 3213, 1, 1024] - - [865, 9643.11] + - [939, 9643.11] - - [1024, 3361, 1, 4096] - - [885, 9062.24] + - [959, 9062.24] - - [1024, 3536, 1, 4096] - - [884, 9530.65] + - [958, 9530.65] - - [1024, 3968, 1, 1024] - - [879, 9377.92] + - [953, 9377.92] - - [4096, 3477, 1, 1024] - - [866, 9700.77] + - [940, 9700.77] - - [4096, 3526, 1, 1024] - - [866, 9824.41] + - [940, 9824.41] - - [1024, 4005, 1, 1024] - - [863, 9362.39] + - [937, 9362.39] - - [1024, 3530, 1, 4096] - - [882, 9487.17] + - [956, 9487.17] - - [1024, 3944, 1, 4096] - - [867, 9464.55] + - [941, 9464.55] - - [4096, 3453, 1, 1024] - - [876, 9826.77] + - [950, 9826.77] - - [4096, 3184, 1, 1024] - - [877, 9833.59] + - [951, 9833.59] - - [4096, 3579, 1, 1024] - - [866, 9962.55] + - [940, 9962.55] - - [4096, 3351, 1, 1024] - - [877, 9653.34] + - [951, 9653.34] - - [4096, 3416, 1, 1024] - - [861, 9810.4] + - [935, 9810.4] - - [64, 100, 624, 100] - - [912, 5408.55] + - [986, 5408.55] - - [1024, 3822, 1, 4096] - - [867, 9196.2] + - [941, 9196.2] - - [1024, 3796, 1, 4096] - - [867, 9131.96] + - [941, 9131.96] - - [4096, 3257, 1, 1024] - - [865, 9767.34] + - [939, 9767.34] - - [4096, 3306, 1, 1024] - - [865, 9893.35] + - [939, 9893.35] - - [1024, 3505, 1, 4096] - - [885, 9450.02] + - [959, 9450.02] - - [1024, 3315, 1, 4096] - - [878, 8979.77] + - [952, 8979.77] - - [1024, 3486, 1, 4096] - - [884, 9393.48] + - [958, 9393.48] - - [4096, 3457, 1, 1024] - - [865, 9653.19] + - [939, 9653.19] - - [4096, 3870, 1, 1024] - - [862, 9717.51] + - [936, 9717.51] - - [1024, 3447, 1, 4096] - - [885, 9273.14] + - [959, 9273.14] - - [1024, 3558, 1, 4096] - - [882, 9567.33] + - [956, 9567.33] - - [4096, 3433, 1, 1024] - - [862, 9759.26] + - [936, 9759.26] - - [4096, 3180, 1, 1024] - - [877, 9738.63] + - [951, 9738.63] - - [1024, 3213, 1, 4096] - - [863, 8692.25] + - [937, 8692.25] - - [1024, 3900, 1, 4096] - - [867, 9388.61] + - [941, 9388.61] - - [4096, 3444, 1, 1024] - - [876, 9869.73] + - [950, 9869.73] - - [1024, 3504, 1, 4096] - - [885, 9429.38] + - [959, 9429.38] - - [4096, 4059, 1, 1024] - - [866, 9920.79] + - [940, 9920.79] - - [1024, 3442, 1, 4096] - - [885, 9273.01] + - [959, 9273.01] - - [4096, 3517, 1, 1024] - - [865, 9808.19] + - [939, 9808.19] - - [1024, 3566, 1, 4096] - - [884, 9622.89] + - [958, 9622.89] - - [4096, 3248, 1, 1024] - - [865, 9730.33] + - [939, 9730.33] - - [1024, 3547, 1, 4096] - - [884, 9564.73] + - [958, 9564.73] - - [64, 59, 1088, 59] - - [903, 4611.76] + - [977, 4611.76] - - [1024, 3340, 1, 4096] - - [884, 8992.21] + - [958, 8992.21] - - [4096, 3480, 1, 1024] - - [866, 9710.17] + - [940, 9710.17] - - [1024, 3968, 1, 4096] - - [866, 9543.11] + - [940, 9543.11] - - [4096, 3424, 1, 1024] - - [862, 9808.66] + - [936, 9808.66] - - [1024, 3906, 1, 1024] - - [864, 9150.54] + - [938, 9150.54] - - [4096, 3265, 1, 1024] - - [865, 9786.85] + - [939, 9786.85] - - [1024, 3384, 1, 4096] - - [885, 9119.56] + - [959, 9119.56] - - [1024, 3494, 1, 4096] - - [882, 9415.52] + - [956, 9415.52] - - [1024, 3236, 1, 4096] - - [879, 8767.14] + - [953, 8767.14] - - [4096, 3497, 1, 1024] - - [866, 9750.86] + - [940, 9750.86] - - [4096, 3354, 1, 1024] - - [877, 9665.17] + - [951, 9665.17] - - [4096, 3055, 1, 1024] - - [866, 9884.09] + - [940, 9884.09] - - [64, 11, 5456, 11] - - [889, 1368.34] + - [963, 1368.34] - - [4096, 3244, 1, 1024] - - [865, 9720.02] + - [939, 9720.02] - - [4096, 3139, 1, 1024] - - [876, 9737.06] + - [950, 9737.06] - - [4096, 3508, 1, 1024] - - [865, 9771.66] + - [939, 9771.66] - - [4096, 4050, 1, 1024] - - [865, 9898.79] + - [939, 9898.79] - - [1024, 3472, 1, 4096] - - [884, 9353.83] + - [958, 9353.83] - - [1024, 3861, 1, 1024] - - [863, 9061.32] + - [937, 9061.32] - - [1024, 3910, 1, 1024] - - [867, 9043.54] + - [941, 9043.54] - - [4096, 3371, 1, 1024] - - [877, 9738.24] + - [951, 9738.24] - - [64, 65, 992, 65] - - [916, 4354.59] + - [990, 4354.59] - - [1024, 3751, 1, 4096] - - [866, 9018.74] + - [940, 9018.74] - - [4096, 3325, 1, 1024] - - [865, 9958.73] + - [939, 9958.73] - - [1024, 3321, 1, 4096] - - [885, 8952.55] + - [959, 8952.55] - - [1024, 3944, 1, 1024] - - [864, 9117.35] + - [938, 9117.35] - - [4096, 3525, 1, 1024] - - [866, 9822.14] + - [940, 9822.14] - - [4096, 3382, 1, 1024] - - [877, 9720.21] + - [951, 9720.21] - - [64, 122, 528, 122] - - [912, 6389.33] + - [986, 6389.33] - - [1024, 3453, 1, 4096] - - [882, 9305.03] + - [956, 9305.03] - - [4096, 3564, 1, 1024] - - [865, 9911.32] + - [939, 9911.32] - - [4096, 3288, 1, 1024] - - [865, 9841.17] + - [939, 9841.17] - - [1024, 3925, 1, 4096] - - [866, 9418.95] + - [940, 9418.95] - - [1024, 3057, 1, 4096] - - [867, 9590.51] + - [941, 9590.51] - - [4096, 3488, 1, 1024] - - [866, 9732.5] + - [940, 9732.5] - - [4096, 3046, 1, 1024] - - [866, 9850.72] + - [940, 9850.72] - - [1024, 3189, 1, 4096] - - [878, 8677.02] + - [952, 8677.02] - - [4096, 3399, 1, 1024] - - [862, 9673.09] + - [936, 9673.09] - - [1024, 3383, 1, 4096] - - [885, 9102.37] + - [959, 9102.37] - - [1024, 3415, 1, 4096] - - [885, 9216.37] + - [959, 9216.37] - - [1024, 3388, 1, 4096] - - [885, 9127.53] + - [959, 9127.53] - - [1024, 3376, 1, 4096] - - [882, 9090.53] + - [956, 9090.53] - - [1024, 3473, 1, 4096] - - [885, 9354.12] + - [959, 9354.12] - - [4096, 3162, 1, 1024] - - [861, 9694.83] + - [935, 9694.83] - - [1024, 3448, 1, 4096] - - [885, 9283.45] + - [959, 9283.45] - - [4096, 3362, 1, 1024] - - [877, 9673.33] + - [951, 9673.33] - - [64, 228, 272, 228] - - [870, 7039.13] + - [944, 7039.13] - - [1024, 3262, 1, 4096] - - [879, 8850.84] + - [953, 8850.84] - - [1024, 3184, 1, 4096] - - [864, 8625.37] + - [938, 8625.37] - - [1024, 3378, 1, 4096] - - [884, 9105.27] + - [958, 9105.27] - - [4096, 3548, 1, 1024] - - [865, 9877.83] + - [939, 9877.83] - - [4096, 2977, 1, 1024] - - [865, 9647.81] + - [939, 9647.81] - - [64, 21, 2976, 21] - - [900, 2364.81] + - [974, 2364.81] - - [64, 112, 576, 111] - - [899, 5973.68] + - [973, 5973.68] - - [4096, 3443, 1, 1024] - - [861, 9784.5] + - [935, 9784.5] - - [1024, 3289, 1, 4096] - - [885, 8874.04] + - [959, 8874.04] - - [1024, 3483, 1, 4096] - - [881, 9380.57] + - [955, 9380.57] - - [4096, 3190, 1, 1024] - - [877, 9850.96] + - [951, 9850.96] - - [1024, 3421, 1, 4096] - - [885, 9214.06] + - [959, 9214.06] - - [1024, 3514, 1, 4096] - - [884, 9458.23] + - [958, 9458.23] - - [1024, 3532, 1, 4096] - - [885, 9513.03] + - [959, 9513.03] - - [1024, 3565, 1, 4096] - - [884, 9630.6] + - [958, 9630.6] - - [4096, 3422, 1, 1024] - - [862, 9733.79] + - [936, 9733.79] - - [4096, 3263, 1, 1024] - - [866, 9776.94] + - [940, 9776.94] - - [4096, 3296, 1, 1024] - - [866, 9860.61] + - [940, 9860.61] - - [4096, 3640, 1, 1024] - - [876, 9782.3] + - [950, 9782.3] - - [4096, 3463, 1, 1024] - - [865, 9672.0] + - [939, 9672.0] - - [4096, 3528, 1, 1024] - - [866, 9829.98] + - [940, 9829.98] - - [1024, 3351, 1, 4096] - - [879, 9054.37] + - [953, 9054.37] - - [1024, 3462, 1, 4096] - - [885, 9327.85] + - [959, 9327.85] - - [4096, 3226, 1, 1024] - - [866, 9674.93] + - [940, 9674.93] - - [4096, 3439, 1, 1024] - - [861, 9823.18] + - [935, 9823.18] - - [4096, 3121, 1, 1024] - - [861, 9672.64] + - [935, 9672.64] - - [1024, 4059, 1, 33708] - - [865, 9885.72] + - [939, 9885.72] - - [1024, 3311, 1, 4096] - - [885, 8910.01] + - [959, 8910.01] - - [1024, 3230, 1, 4096] - - [885, 8705.9] + - [959, 8705.9] - - [4096, 3353, 1, 1024] - - [877, 9671.86] + - [951, 9671.86] - - [4096, 3402, 1, 1024] - - [862, 9727.04] + - [936, 9727.04] - - [1024, 3427, 1, 4096] - - [885, 9233.55] + - [959, 9233.55] - - [1024, 3346, 1, 4096] - - [885, 9015.77] + - [959, 9015.77] - - [1024, 3126, 1, 4096] - - [879, 8519.31] + - [953, 8519.31] - - [1024, 3796, 1, 1024] - - [863, 8916.75] + - [937, 8916.75] - - [1024, 3990, 1, 4096] - - [867, 9600.86] + - [941, 9600.86] - - [1024, 3257, 1, 4096] - - [863, 8790.42] + - [937, 8790.42] - - [4096, 3996, 1, 1024] - - [866, 9788.25] + - [940, 9788.25] - - [64, 143, 432, 143] - - [894, 6087.24] + - [968, 6087.24] - - [1024, 3306, 1, 4096] - - [878, 9035.69] + - [952, 9035.69] - - [1024, 3389, 1, 4096] - - [885, 9134.92] + - [959, 9134.92] - - [1024, 3500, 1, 4096] - - [885, 9443.33] + - [959, 9443.33] - - [1024, 3999, 1, 33708] - - [866, 9741.24] + - [940, 9741.24] - - [4096, 3486, 1, 1024] - - [866, 9719.67] + - [940, 9719.67] - - [1024, 3438, 1, 4096] - - [885, 9259.38] + - [959, 9259.38] - - [4096, 3616, 1, 1024] - - [876, 9739.77] + - [950, 9739.77] - - [1024, 3955, 1, 1024] - - [878, 9260.37] + - [952, 9260.37] - - [4096, 3430, 1, 1024] - - [877, 9819.95] + - [951, 9819.95] - - [4096, 3271, 1, 1024] - - [866, 9802.04] + - [940, 9802.04] - - [1024, 3364, 1, 4096] - - [878, 9144.63] + - [952, 9144.63] - - [64, 54, 1184, 54] - - [898, 4315.78] + - [972, 4315.78] - - [1024, 3497, 1, 4096] - - [885, 9429.42] + - [959, 9429.42] - - [4096, 3503, 1, 1024] - - [865, 9764.48] + - [939, 9764.48] - - [4096, 3344, 1, 1024] - - [862, 9614.16] + - [936, 9614.16] - - [1024, 3457, 1, 4096] - - [885, 9320.6] + - [959, 9320.6] - - [4096, 3466, 1, 1024] - - [865, 9677.81] + - [939, 9677.81] - - [1024, 3976, 1, 33708] - - [866, 9685.38] + - [940, 9685.38] - - [1024, 3395, 1, 4096] - - [884, 9146.39] + - [958, 9146.39] - - [4096, 3361, 1, 1024] - - [876, 9677.89] + - [950, 9677.89] - - [1024, 3751, 1, 33708] - - [874, 9234.69] + - [948, 9234.69] - - [1024, 3822, 1, 1024] - - [863, 8977.83] + - [937, 8977.83] - - [4096, 3315, 1, 1024] - - [866, 9922.54] + - [940, 9922.54] - - [1024, 3163, 1, 4096] - - [878, 8577.79] + - [952, 8577.79] - - [4096, 3547, 1, 1024] - - [866, 9882.92] + - [940, 9882.92] - - [4096, 3340, 1, 1024] - - [876, 9635.42] + - [950, 9635.42] - - [1024, 3296, 1, 4096] - - [885, 8874.66] + - [959, 8874.66] - - [1024, 3468, 1, 4096] - - [885, 9350.26] + - [959, 9350.26] - - [4096, 3294, 1, 1024] - - [865, 9856.87] + - [939, 9856.87] - - [1024, 3406, 1, 4096] - - [881, 9162.84] + - [955, 9162.84] - - [1024, 3860, 1, 33708] - - [865, 9403.56] + - [939, 9403.56] - - [1024, 3584, 1, 4096] - - [882, 9677.44] + - [956, 9677.44] - - [4096, 3189, 1, 1024] - - [877, 9820.69] + - [951, 9820.69] - - [4096, 3494, 1, 1024] - - [865, 9747.68] + - [939, 9747.68] - - [64, 135, 480, 135] - - [891, 5966.34] + - [965, 5966.34] - - [1024, 3093, 1, 4096] - - [879, 8446.06] + - [953, 8446.06] - - [4096, 3421, 1, 1024] - - [862, 9776.03] + - [936, 9776.03] - - [1024, 3479, 1, 4096] - - [885, 9376.54] + - [959, 9376.54] - - [1024, 3433, 1, 4096] - - [885, 9251.14] + - [959, 9251.14] - - [4096, 3311, 1, 1024] - - [865, 9901.53] + - [939, 9901.53] - - [1024, 3381, 1, 4096] - - [885, 9103.99] + - [959, 9103.99] - - [1024, 3996, 1, 4096] - - [866, 9609.56] + - [940, 9609.56] - - [4096, 3384, 1, 1024] - - [876, 9750.01] + - [950, 9750.01] - - [1024, 3247, 1, 4096] - - [864, 8872.59] + - [938, 8872.59] - - [1024, 3169, 1, 4096] - - [863, 8597.61] + - [937, 8597.61] - - [1024, 3088, 1, 4096] - - [879, 8410.07] + - [953, 8410.07] - - [1024, 3363, 1, 4096] - - [885, 9069.5] + - [959, 9069.5] - - [1024, 3538, 1, 4096] - - [884, 9529.68] + - [958, 9529.68] - - [1024, 3996, 1, 1024] - - [868, 9323.06] + - [942, 9323.06] - - [4096, 3169, 1, 1024] - - [862, 9821.4] + - [936, 9821.4] - - [4096, 3538, 1, 1024] - - [865, 9859.42] + - [939, 9859.42] - - [4096, 3401, 1, 1024] - - [862, 9754.5] + - [936, 9754.5] - - [4096, 3581, 1, 1024] - - [865, 9960.71] + - [939, 9960.71] - - [1024, 3180, 1, 4096] - - [863, 8635.05] + - [937, 8635.05] - - [1024, 3870, 1, 1024] - - [864, 9085.69] + - [938, 9085.69] - - [4096, 3555, 1, 1024] - - [865, 9905.74] + - [939, 9905.74] - - [4096, 3412, 1, 1024] - - [877, 9778.56] + - [951, 9778.56] - - [4096, 3302, 1, 1024] - - [865, 9888.71] + - [939, 9888.71] - - [1024, 3561, 1, 4096] - - [881, 9597.05] + - [955, 9597.05] - - [1024, 3302, 1, 4096] - - [885, 8900.87] + - [959, 8900.87] - - [1024, 3976, 1, 4096] - - [867, 9563.22] + - [941, 9563.22] - - [4096, 3485, 1, 1024] - - [865, 9722.57] + - [939, 9722.57] - - [4096, 3534, 1, 1024] - - [865, 9847.22] + - [939, 9847.22] - - [1024, 3110, 1, 4096] - - [878, 8458.56] + - [952, 8458.56] - - [1024, 3401, 1, 4096] - - [885, 9174.81] + - [959, 9174.81] - - [4096, 3216, 1, 1024] - - [865, 9645.49] + - [939, 9645.49] - - [1024, 4020, 1, 33708] - - [865, 9793.61] + - [939, 9793.61] - - [1024, 3215, 1, 4096] - - [885, 8677.51] + - [959, 8677.51] - - [4096, 3566, 1, 1024] - - [865, 9924.78] + - [939, 9924.78] - - [1024, 3137, 1, 4096] - - [863, 8547.07] + - [937, 8547.07] - - [4096, 3359, 1, 1024] - - [862, 9673.73] + - [936, 9673.73] - - [4096, 3392, 1, 1024] - - [877, 9757.51] + - [951, 9757.51] - - [1024, 3506, 1, 4096] - - [885, 9443.0] + - [959, 9443.0] - - [4096, 3233, 1, 1024] - - [865, 9698.7] + - [939, 9698.7] - - [1024, 3444, 1, 4096] - - [885, 9275.54] + - [959, 9275.54] - - [1024, 3975, 1, 4096] - - [866, 9556.87] + - [940, 9556.87] - - [1024, 3870, 1, 33708] - - [865, 9427.44] + - [939, 9427.44] - - [4096, 3465, 1, 1024] - - [866, 9675.01] + - [940, 9675.01] - - [4096, 3968, 1, 1024] - - [862, 9927.93] + - [936, 9927.93] - - [1024, 3523, 1, 4096] - - [885, 9494.15] + - [959, 9494.15] - - [64, 10, 5952, 10] - - [889, 1224.16] + - [963, 1224.16] - - [4096, 3990, 1, 1024] - - [865, 9771.27] + - [939, 9771.27] - - [1024, 3549, 1, 4096] - - [884, 9553.42] + - [958, 9553.42] - - [1024, 3342, 1, 4096] - - [885, 9007.31] + - [959, 9007.31] - - [4096, 3476, 1, 1024] - - [865, 9703.66] + - [939, 9703.66] - - [64, 232, 272, 228] - - [871, 7078.93] + - [945, 7078.93] - - [1024, 3418, 1, 4096] - - [885, 9213.09] + - [959, 9213.09] - - [1024, 3859, 1, 1024] - - [864, 9087.54] + - [938, 9087.54] - - [4096, 3339, 1, 1024] - - [877, 9594.0] + - [951, 9594.0] - - [4096, 3452, 1, 1024] - - [862, 9872.69] + - [936, 9872.69] - - [4096, 3293, 1, 1024] - - [865, 9842.65] + - [939, 9842.65] - - [4096, 3840, 1, 1024] - - [866, 10030.8] + - [940, 10030.8] - - [1024, 3369, 1, 4096] - - [863, 9099.72] + - [937, 9099.72] - - [64, 193, 320, 193] - - [893, 6425.8] + - [967, 6425.8] - - [1024, 3544, 1, 4096] - - [882, 9556.64] + - [956, 9556.64] - - [4096, 3493, 1, 1024] - - [866, 9743.34] + - [940, 9743.34] - - [4096, 3350, 1, 1024] - - [877, 9653.11] + - [951, 9653.11] - - [64, 71, 896, 71] - - [917, 4686.73] + - [991, 4686.73] - - [4096, 3256, 1, 1024] - - [865, 9763.78] + - [939, 9763.78] - - [1024, 3870, 1, 4096] - - [867, 9305.28] + - [941, 9305.28] - - [4096, 4012, 1, 1024] - - [866, 9817.35] + - [940, 9817.35] - - [1024, 3280, 1, 4096] - - [885, 8842.02] + - [959, 8842.02] - - [4096, 3456, 1, 1024] - - [861, 9874.43] + - [935, 9874.43] - - [1024, 3555, 1, 4096] - - [884, 9599.63] + - [958, 9599.63] - - [4096, 3014, 1, 1024] - - [865, 9762.28] + - [939, 9762.28] - - [1024, 3474, 1, 4096] - - [885, 9373.67] + - [959, 9373.67] - - [4096, 3367, 1, 1024] - - [861, 9694.64] + - [935, 9694.64] - - [4096, 3432, 1, 1024] - - [877, 9855.27] + - [951, 9855.27] - - [64, 84, 752, 84] - - [904, 5247.18] + - [978, 5247.18] - - [4096, 3273, 1, 1024] - - [866, 9801.87] + - [940, 9801.87] - - [4096, 3130, 1, 1024] - - [862, 9672.52] + - [936, 9672.52] - - [1024, 2984, 1, 4096] - - [867, 9403.7] + - [941, 9403.7] - - [1024, 3995, 1, 1024] - - [879, 9392.61] + - [953, 9392.61] - - [1024, 3517, 1, 4096] - - [885, 9481.39] + - [959, 9481.39] - - [1024, 3455, 1, 4096] - - [885, 9302.29] + - [959, 9302.29] - - [1024, 3939, 1, 4096] - - [867, 9469.89] + - [941, 9469.89] - - [64, 49, 1296, 49] - - [897, 3938.96] + - [971, 3938.96] - - [64, 14, 4368, 14] - - [889, 1802.47] + - [963, 1802.47] - - [64, 25, 2512, 25] - - [908, 2760.54] + - [982, 2760.54] - - [4096, 3147, 1, 1024] - - [877, 9713.03] + - [951, 9713.03] - - [4096, 3516, 1, 1024] - - [865, 9805.93] + - [939, 9805.93] - - [1024, 3876, 1, 4096] - - [867, 9320.56] + - [941, 9320.56] - - [1024, 3191, 1, 4096] - - [864, 8640.76] + - [938, 8640.76] - - [4096, 3411, 1, 1024] - - [876, 9737.37] + - [950, 9737.37] - - [1024, 3337, 1, 4096] - - [885, 8990.13] + - [959, 8990.13] - - [1024, 3512, 1, 4096] - - [885, 9459.65] + - [959, 9459.65] - - [4096, 3301, 1, 1024] - - [865, 9877.26] + - [939, 9877.26] - - [1024, 3450, 1, 4096] - - [884, 9283.11] + - [958, 9283.11] - - [4096, 3533, 1, 1024] - - [865, 9848.62] + - [939, 9848.62] - - [4096, 3390, 1, 1024] - - [877, 9764.61] + - [951, 9764.61] - - [4096, 3231, 1, 1024] - - [865, 9693.81] + - [939, 9693.81] - - [1024, 2499, 1, 4096] - - [884, 9304.81] + - [958, 9304.81] - - [1024, 3186, 1, 4096] - - [864, 8649.55] + - [938, 8649.55] - - [1024, 3380, 1, 4096] - - [885, 9101.77] + - [959, 9101.77] - - [4096, 3496, 1, 1024] - - [866, 9754.3] + - [940, 9754.3] - - [1024, 3956, 1, 33708] - - [865, 9636.77] + - [939, 9636.77] - - [1024, 3976, 1, 1024] - - [867, 9248.41] + - [941, 9248.41] - - [4096, 2736, 1, 1024] - - [865, 9651.91] + - [939, 9651.91] - - [1024, 3291, 1, 4096] - - [885, 8868.94] + - [959, 8868.94] - - [1024, 3944, 1, 33708] - - [866, 9607.0] + - [940, 9607.0] - - [1024, 3485, 1, 4096] - - [884, 9385.96] + - [958, 9385.96] - - [4096, 3138, 1, 1024] - - [862, 9672.15] + - [936, 9672.15] - - [1024, 3423, 1, 4096] - - [885, 9222.77] + - [959, 9222.77] - - [1024, 3491, 1, 4096] - - [885, 9405.02] + - [959, 9405.02] - - [1024, 3860, 1, 4096] - - [868, 9282.94] + - [942, 9282.94] - - [4096, 3211, 1, 1024] - - [865, 9640.42] + - [939, 9640.42] - - [1024, 3221, 1, 4096] - - [879, 8709.4] + - [953, 8709.4] - - [1024, 2917, 1, 4096] - - [867, 9177.11] + - [941, 9177.11] - - [4096, 3475, 1, 1024] - - [865, 9703.45] + - [939, 9703.45] - - [4096, 3524, 1, 1024] - - [865, 9816.23] + - [939, 9816.23] - - [4096, 2985, 1, 1024] - - [866, 9686.91] + - [940, 9686.91] - - [1024, 3480, 1, 4096] - - [885, 9380.2] + - [959, 9380.2] - - [4096, 3222, 1, 1024] - - [865, 9666.8] + - [939, 9666.8] - - [4096, 3451, 1, 1024] - - [861, 9877.91] + - [935, 9877.91] - - [1024, 3969, 1, 33708] - - [865, 9669.64] + - [939, 9669.64] - - [1024, 3640, 1, 1024] - - [872, 8565.68] + - [946, 8565.68] - - [1024, 3297, 1, 4096] - - [881, 8889.22] + - [955, 8889.22] - - [4096, 3944, 1, 1024] - - [862, 9902.85] + - [936, 9902.85] - - [1024, 3216, 1, 4096] - - [864, 8695.88] + - [938, 8695.88] - - [1024, 3840, 1, 1024] - - [878, 9046.05] + - [952, 9046.05] - - [4096, 3349, 1, 1024] - - [876, 9676.82] + - [950, 9676.82] - - [4096, 3398, 1, 1024] - - [862, 9775.84] + - [936, 9775.84] - - [1024, 3154, 1, 4096] - - [879, 8662.26] + - [953, 8662.26] - - [1024, 3978, 1, 33708] - - [866, 9689.16] + - [940, 9689.16] - - [1024, 3348, 1, 4096] - - [885, 9014.67] + - [959, 9014.67] - - [4096, 3304, 1, 1024] - - [866, 9886.8] + - [940, 9886.8] - - [4096, 4030, 1, 1024] - - [866, 9859.1] + - [940, 9859.1] - - [1024, 4026, 1, 1024] - - [863, 9326.64] + - [937, 9326.64] - - [4096, 3471, 1, 1024] - - [865, 9683.0] + - [939, 9683.0] - - [1024, 3259, 1, 4096] - - [879, 8792.19] + - [953, 8792.19] - - [64, 132, 480, 132] - - [919, 6027.86] + - [993, 6027.86] - - [1024, 3308, 1, 4096] - - [884, 8905.14] + - [958, 8905.14] - - [4096, 3391, 1, 1024] - - [877, 9765.35] + - [951, 9765.35] - - [1024, 3312, 1, 4096] - - [885, 8917.74] + - [959, 8917.74] - - [1024, 3502, 1, 4096] - - [885, 9435.62] + - [959, 9435.62] - - [1024, 3968, 1, 33708] - - [865, 9668.24] + - [939, 9668.24] - - [1024, 3424, 1, 4096] - - [881, 9215.99] + - [955, 9215.99] - - [64, 13, 4672, 13] - - [890, 1662.35] + - [964, 1662.35] - - [4096, 4032, 1, 1024] - - [876, 9877.82] + - [950, 9877.82] - - [1024, 3900, 1, 1024] - - [879, 9116.93] + - [953, 9116.93] - - [4096, 3442, 1, 1024] - - [876, 9773.18] + - [950, 9773.18] - - [1024, 3366, 1, 4096] - - [885, 9079.46] + - [959, 9079.46] - - [4096, 3999, 1, 1024] - - [865, 9786.46] + - [939, 9786.46] - - [1024, 3477, 1, 4096] - - [885, 9364.89] + - [959, 9364.89] - - [1024, 2505, 1, 4096] - - [885, 9304.03] + - [959, 9304.03] - - [4096, 3515, 1, 1024] - - [865, 9797.93] + - [939, 9797.93] - - [1024, 3564, 1, 4096] - - [881, 9632.86] + - [955, 9632.86] - - [4096, 3057, 1, 1024] - - [866, 9880.19] + - [940, 9880.19] - - [1024, 3339, 1, 4096] - - [864, 9029.86] + - [938, 9029.86] - - [4096, 3262, 1, 1024] - - [865, 9780.1] + - [939, 9780.1] - - [1024, 4030, 1, 4096] - - [868, 9682.0] + - [942, 9682.0] - - [1024, 3265, 1, 4096] - - [885, 8797.52] + - [959, 8797.52] - - [1024, 3459, 1, 4096] - - [885, 9313.06] + - [959, 9313.06] - - [4096, 3462, 1, 1024] - - [866, 9669.73] + - [940, 9669.73] - - [64, 85, 752, 85] - - [904, 5186.93] + - [978, 5186.93] - - [1024, 3513, 1, 4096] - - [882, 9469.15] + - [956, 9469.15] - - [1024, 3397, 1, 4096] - - [885, 9151.77] + - [959, 9151.77] - - [4096, 3572, 1, 1024] - - [865, 9945.7] + - [939, 9945.7] - - [4096, 3389, 1, 1024] - - [877, 9740.86] + - [951, 9740.86] - - [4096, 3438, 1, 1024] - - [877, 9822.47] + - [951, 9822.47] - - [64, 102, 624, 100] - - [912, 5487.0] + - [986, 5487.0] - - [1024, 3640, 1, 33708] - - [873, 9083.53] + - [947, 9083.53] - - [1024, 3995, 1, 33708] - - [866, 9731.99] + - [940, 9731.99] - - [1024, 3165, 1, 4096] - - [878, 8601.9] + - [952, 8601.9] - - [4096, 3543, 1, 1024] - - [866, 9868.63] + - [940, 9868.63] - - [4096, 3352, 1, 1024] - - [861, 9668.44] + - [935, 9668.44] - - [1024, 3359, 1, 4096] - - [882, 9050.33] + - [956, 9050.33] - - [1024, 3470, 1, 4096] - - [885, 9355.17] + - [959, 9355.17] - - [64, 15, 4096, 15] - - [889, 1945.43] + - [963, 1945.43] - - [1024, 3392, 1, 4096] - - [884, 9139.71] + - [958, 9139.71] - - [64, 78, 816, 77] - - [896, 4870.56] + - [970, 4870.56] - - [4096, 3137, 1, 1024] - - [861, 9600.22] + - [935, 9600.22] - - [4096, 3506, 1, 1024] - - [866, 9779.08] + - [940, 9779.08] - - [1024, 3095, 1, 4096] - - [878, 8381.24] + - [952, 8381.24] - - [1024, 3859, 1, 4096] - - [865, 9288.63] + - [939, 9288.63] - - [4096, 3369, 1, 1024] - - [877, 9697.73] + - [951, 9697.73] - - [64, 45, 1424, 45] - - [914, 3883.74] + - [988, 3883.74] - - [1024, 3435, 1, 4096] - - [885, 9264.62] + - [959, 9264.62] - - [1024, 3354, 1, 4096] - - [885, 9035.47] + - [959, 9035.47] - - [1024, 3055, 1, 4096] - - [866, 9597.45] + - [940, 9597.45] - - [4096, 3523, 1, 1024] - - [865, 9821.79] + - [939, 9821.79] - - [4096, 3380, 1, 1024] - - [861, 9721.39] + - [935, 9721.39] - - [1024, 3233, 1, 4096] - - [878, 8724.75] + - [952, 8724.75] - - [4096, 3221, 1, 1024] - - [865, 9661.04] + - [939, 9661.04] - - [4096, 3270, 1, 1024] - - [865, 9797.92] + - [939, 9797.92] - - [4096, 3593, 1, 1024] - - [876, 9679.31] + - [950, 9679.31] - - [1024, 3358, 1, 4096] - - [885, 9051.82] + - [959, 9051.82] - - [1024, 3540, 1, 4096] - - [885, 9533.59] + - [959, 9533.59] - - [4096, 3502, 1, 1024] - - [866, 9760.65] + - [940, 9760.65] - - [4096, 2505, 1, 1024] - - [866, 9680.52] + - [940, 9680.52] - - [4096, 3397, 1, 1024] - - [876, 9785.85] + - [950, 9785.85] - - [1024, 3300, 1, 4096] - - [879, 8907.85] + - [953, 8907.85] - - [4096, 3095, 1, 1024] - - [862, 9618.78] + - [936, 9618.78] - - [1024, 3182, 1, 4096] - - [878, 8606.16] + - [952, 8606.16] - - [1024, 3299, 1, 4096] - - [884, 8885.48] + - [958, 8885.48] - - [1024, 3276, 1, 4096] - - [879, 8872.75] + - [953, 8872.75] - - [1024, 3360, 1, 4096] - - [882, 9044.2] + - [956, 9044.2] - - [4096, 3360, 1, 1024] - - [877, 9681.39] + - [951, 9681.39] - - [4096, 2918, 1, 1024] - - [861, 9732.74] + - [935, 9732.74] - - [1024, 3939, 1, 33708] - - [865, 9595.96] + - [939, 9595.96] - - [4096, 3314, 1, 1024] - - [866, 9915.02] + - [940, 9915.02] - - [1024, 3319, 1, 4096] - - [885, 8956.37] + - [959, 8956.37] - - [64, 35, 1808, 35] - - [902, 3060.27] + - [976, 3060.27] - - [1024, 3942, 1, 1024] - - [878, 9211.83] + - [952, 9211.83] - - [1024, 3465, 1, 4096] - - [885, 9340.73] + - [959, 9340.73] - - [4096, 3546, 1, 1024] - - [866, 9875.41] + - [940, 9875.41] - - [1024, 3403, 1, 4096] - - [878, 9224.34] + - [952, 9224.34] - - [1024, 3948, 1, 1024] - - [864, 9245.63] + - [938, 9245.63] - - [4096, 3441, 1, 1024] - - [877, 9758.72] + - [951, 9758.72] - - [1024, 3139, 1, 4096] - - [878, 8582.84] + - [952, 8582.84] - - [1024, 3563, 1, 4096] - - [885, 9620.74] + - [959, 9620.74] - - [1024, 3508, 1, 4096] - - [882, 9449.36] + - [956, 9449.36] - - [1024, 3975, 1, 33708] - - [865, 9683.55] + - [939, 9683.55] - - [1024, 3446, 1, 4096] - - [884, 9289.51] + - [958, 9289.51] - - [1024, 3529, 1, 4096] - - [881, 9491.29] + - [955, 9491.29] - - [64, 112, 576, 112] - - [906, 6387.14] + - [980, 6387.14] - - [4096, 3461, 1, 1024] - - [866, 9663.33] + - [940, 9663.33] - - [1024, 3574, 1, 4096] - - [884, 9662.88] + - [958, 9662.88] - - [1024, 3101, 1, 4096] - - [879, 8468.34] + - [953, 8468.34] - - [1024, 3927, 1, 1024] - - [864, 9207.97] + - [938, 9207.97] - - [4096, 3224, 1, 1024] - - [866, 9665.61] + - [940, 9665.61] - - [4096, 3437, 1, 1024] - - [862, 9857.21] + - [936, 9857.21] - - [4096, 3900, 1, 1024] - - [877, 9826.25] + - [951, 9826.25] - - [1024, 3495, 1, 4096] - - [885, 9412.41] + - [959, 9412.41] - - [1024, 3977, 1, 33708] - - [865, 9687.87] + - [939, 9687.87] - - [1024, 3328, 1, 4096] - - [885, 8975.57] + - [959, 8975.57] - - [4096, 3168, 1, 1024] - - [861, 9754.87] + - [935, 9754.87] - - [1024, 4026, 1, 33708] - - [865, 9807.24] + - [939, 9807.24] - - [1024, 3292, 1, 4096] - - [878, 8901.83] + - [952, 8901.83] - - [1024, 3294, 1, 4096] - - [885, 8877.03] + - [959, 8877.03] - - [4096, 3335, 1, 1024] - - [862, 9616.23] + - [936, 9616.23] - - [4096, 3400, 1, 1024] - - [876, 9710.73] + - [950, 9710.73] - - [1024, 3287, 1, 4096] - - [863, 8908.07] + - [937, 8908.07] - - [1024, 3910, 1, 4096] - - [867, 9401.03] + - [941, 9401.03] - - [1024, 3780, 1, 1024] - - [878, 8863.29] + - [952, 8863.29] - - [4096, 3098, 1, 1024] - - [862, 9606.47] + - [936, 9606.47] - - [1024, 3584, 1, 33708] - - [885, 9775.33] + - [959, 9775.33] - - [64, 29, 2176, 29] - - [907, 3135.03] + - [981, 3135.03] - - [1024, 3371, 1, 4096] - - [863, 9117.81] + - [937, 9117.81] - - [1024, 3546, 1, 4096] - - [885, 9547.3] + - [959, 9547.3] - - [1024, 4012, 1, 1024] - - [867, 9353.73] + - [941, 9353.73] - - [4096, 3505, 1, 1024] - - [865, 9773.17] + - [939, 9773.17] - - [4096, 3554, 1, 1024] - - [865, 9895.59] + - [939, 9895.59] - - [4096, 3063, 1, 1024] - - [865, 9898.98] + - [939, 9898.98] - - [1024, 3900, 1, 33708] - - [866, 9502.93] + - [940, 9502.93] - - [1024, 3345, 1, 4096] - - [885, 9015.85] + - [959, 9015.85] - - [1024, 3357, 1, 4096] - - [885, 9041.23] + - [959, 9041.23] - - [1024, 3282, 1, 4096] - - [878, 8860.17] + - [952, 8860.17] - - [4096, 3484, 1, 1024] - - [866, 9721.33] + - [940, 9721.33] - - [1024, 3557, 1, 4096] - - [882, 9573.48] + - [956, 9573.48] - - [1024, 3476, 1, 4096] - - [885, 9361.72] + - [959, 9361.72] - - [1024, 3751, 1, 1024] - - [879, 8849.11] + - [953, 8849.11] - - [4096, 3379, 1, 1024] - - [862, 9741.49] + - [936, 9741.49] - - [4096, 3428, 1, 1024] - - [861, 9767.82] + - [935, 9767.82] - - [4096, 3126, 1, 1024] - - [876, 9701.9] + - [950, 9701.9] - - [64, 41, 1552, 41] - - [911, 3555.69] + - [985, 3555.69] - - [1024, 3325, 1, 4096] - - [863, 8962.41] + - [937, 8962.41] - - [4096, 3501, 1, 1024] - - [865, 9762.01] + - [939, 9762.01] - - [4096, 3358, 1, 1024] - - [861, 9680.42] + - [935, 9680.42] - - [1024, 3441, 1, 4096] - - [885, 9271.27] + - [959, 9271.27] - - [1024, 3552, 1, 4096] - - [881, 9565.42] + - [955, 9565.42] - - [4096, 3232, 1, 1024] - - [866, 9696.81] + - [940, 9696.81] - - [64, 18, 3440, 18] - - [886, 2059.33] + - [960, 2059.33] - - [1024, 3412, 1, 4096] - - [885, 9199.28] + - [959, 9199.28] - - [1024, 3372, 1, 4096] - - [882, 9083.49] + - [956, 9083.49] - - [1024, 3585, 1, 4096] - - [872, 8710.29] + - [946, 8710.29] - - [4096, 3143, 1, 1024] - - [877, 9692.12] + - [951, 9692.12] - - [4096, 3464, 1, 1024] - - [865, 9661.93] + - [939, 9661.93] - - [1024, 3145, 1, 4096] - - [864, 8526.33] + - [938, 8526.33] - - [4096, 3375, 1, 1024] - - [876, 9734.78] + - [950, 9734.78] - - [4096, 2917, 1, 1024] - - [861, 9714.57] + - [935, 9714.57] - - [4096, 3978, 1, 1024] - - [866, 9741.43] + - [940, 9741.43] - - [1024, 2765, 1, 4096] - - [867, 8706.75] + - [941, 8706.75] - - [64, 148, 432, 148] - - [892, 6372.17] + - [966, 6372.17] - - [1024, 3452, 1, 4096] - - [884, 9301.38] + - [958, 9301.38] - - [4096, 3584, 1, 1024] - - [866, 10005.7] + - [940, 10005.7] - - [4096, 3545, 1, 1024] - - [866, 9877.87] + - [940, 9877.87] - - [1024, 3352, 1, 4096] - - [885, 9035.19] + - [959, 9035.19] - - [64, 159, 400, 160] - - [894, 6952.11] + - [968, 6952.11] - - [4096, 3292, 1, 1024] - - [865, 9856.51] + - [939, 9856.51] - - [1024, 3525, 1, 4096] - - [885, 9501.5] + - [959, 9501.5] - - [1024, 3266, 1, 4096] - - [885, 8817.43] + - [959, 8817.43] - - [1024, 3382, 1, 4096] - - [884, 9101.54] + - [958, 9101.54] - - [4096, 3492, 1, 1024] - - [865, 9747.29] + - [939, 9747.29] - - [4096, 3419, 1, 1024] - - [877, 9745.88] + - [951, 9745.88] - - [1024, 3796, 1, 33708] - - [874, 9356.26] + - [948, 9356.26] - - [1024, 3293, 1, 4096] - - [881, 8868.4] + - [955, 8868.4] - - [4096, 3796, 1, 1024] - - [866, 9885.36] + - [940, 9885.36] - - [1024, 3487, 1, 4096] - - [882, 9391.34] + - [956, 9391.34] - - [4096, 3166, 1, 1024] - - [877, 9718.46] + - [951, 9718.46] - - [64, 102, 624, 101] - - [906, 5547.84] + - [980, 5547.84] - - [1024, 3409, 1, 4096] - - [885, 9187.88] + - [959, 9187.88] - - [1024, 3520, 1, 4096] - - [884, 9485.09] + - [958, 9485.09] - - [1024, 3573, 1, 4096] - - [885, 9652.71] + - [959, 9652.71] - - [4096, 3366, 1, 1024] - - [861, 9684.31] + - [935, 9684.31] - - [4096, 3720, 1, 1024] - - [877, 9703.34] + - [951, 9703.34] - - [4096, 3207, 1, 1024] - - [865, 9626.21] + - [939, 9626.21] - - [4096, 3272, 1, 1024] - - [865, 9795.51] + - [939, 9795.51] - - [1024, 3390, 1, 4096] - - [885, 9125.88] + - [959, 9125.88] - - [4096, 3183, 1, 1024] - - [877, 9825.87] + - [951, 9825.87] - - [4096, 3536, 1, 1024] - - [866, 9846.51] + - [940, 9846.51] - - [4096, 3563, 1, 1024] - - [866, 9913.8] + - [940, 9913.8] - - [1024, 3482, 1, 4096] - - [885, 9376.91] + - [959, 9376.91] - - [4096, 3447, 1, 1024] - - [876, 9875.09] + - [950, 9875.09] - - [4096, 3955, 1, 1024] - - [861, 9922.39] + - [935, 9922.39] - - [4096, 4005, 1, 1024] - - [866, 9803.43] + - [940, 9803.43] - - [1024, 3493, 1, 4096] - - [885, 9411.37] + - [959, 9411.37] - - [4096, 3410, 1, 1024] - - [861, 9788.34] + - [935, 9788.34] - - [1024, 3422, 1, 4096] - - [884, 9216.28] + - [958, 9216.28] - - [1024, 3350, 1, 4096] - - [879, 9068.02] + - [953, 9068.02] - - [4096, 3300, 1, 1024] - - [866, 9883.29] + - [940, 9883.29] - - [4096, 3910, 1, 1024] - - [876, 9800.12] + - [950, 9800.12] - - [1024, 3489, 1, 4096] - - [885, 9398.66] + - [959, 9398.66] - - [4096, 3483, 1, 1024] - - [865, 9715.96] + - [939, 9715.96] - - [4096, 3532, 1, 1024] - - [866, 9837.99] + - [940, 9837.99] - - [64, 101, 624, 101] - - [906, 5452.28] + - [980, 5452.28] - - [4096, 3230, 1, 1024] - - [866, 9683.6] + - [940, 9683.6] - - [4096, 3427, 1, 1024] - - [861, 9760.72] + - [935, 9760.72] - - [1024, 3377, 1, 4096] - - [885, 9101.17] + - [959, 9101.17] - - [1024, 3488, 1, 4096] - - [884, 9381.99] + - [958, 9381.99] - - [1024, 3616, 1, 4096] - - [867, 8709.33] + - [941, 8709.33] - - [1024, 3426, 1, 4096] - - [885, 9229.43] + - [959, 9229.43] - - [4096, 3357, 1, 1024] - - [877, 9668.5] + - [951, 9668.5] - - [4096, 3406, 1, 1024] - - [862, 9748.57] + - [936, 9748.57] - - [1024, 3046, 1, 4096] - - [867, 9590.43] + - [941, 9590.43] - - [1024, 3272, 1, 4096] - - [878, 8930.2] + - [952, 8930.2] - - [1024, 3256, 1, 4096] - - [863, 8828.16] + - [937, 8828.16] - - [4096, 3247, 1, 1024] - - [865, 9741.81] + - [939, 9741.81] - - [4096, 3088, 1, 1024] - - [877, 9589.07] + - [951, 9589.07] - - [1024, 3531, 1, 4096] - - [884, 9501.06] + - [958, 9501.06] - - [64, 160, 400, 160] - - [920, 7334.03] + - [994, 7334.03] - - [4096, 3511, 1, 1024] - - [866, 9789.38] + - [940, 9789.38] - - [1024, 3720, 1, 33708] - - [875, 9214.68] + - [949, 9214.68] - - [1024, 3267, 1, 4096] - - [878, 8831.04] + - [952, 8831.04] - - [1024, 3270, 1, 4096] - - [879, 8876.68] + - [953, 8876.68] - - [1024, 3461, 1, 4096] - - [884, 9327.55] + - [958, 9327.55] - - [4096, 3474, 1, 1024] - - [865, 9697.04] + - [939, 9697.04] - - [4096, 2984, 1, 1024] - - [866, 9674.08] + - [940, 9674.08] - - [1024, 3399, 1, 4096] - - [884, 9158.58] + - [958, 9158.58] - - [4096, 3574, 1, 1024] - - [865, 9942.3] + - [939, 9942.3] - - [1024, 3876, 1, 1024] - - [879, 9085.13] + - [953, 9085.13] - - [4096, 3337, 1, 1024] - - [862, 9611.43] + - [936, 9611.43] - - [4096, 3450, 1, 1024] - - [877, 9930.35] + - [951, 9930.35] - - [1024, 3720, 1, 1024] - - [863, 8755.49] + - [937, 8755.49] - - [1024, 4059, 1, 1024] - - [868, 9366.67] + - [942, 9366.67] - - [4096, 3291, 1, 1024] - - [865, 9856.33] + - [939, 9856.33] - - [64, 93, 688, 93] - - [909, 5497.11] + - [983, 5497.11] - - [4096, 3995, 1, 1024] - - [865, 9776.67] + - [939, 9776.67] - - [64, 147, 432, 147] - - [895, 6233.88] + - [969, 6233.88] - - [4096, 3491, 1, 1024] - - [865, 9742.94] + - [939, 9742.94] - - [4096, 3348, 1, 1024] - - [877, 9634.11] + - [951, 9634.11] - - [4096, 3925, 1, 1024] - - [876, 9848.54] + - [950, 9848.54] - - [4096, 3894, 1, 1024] - - [876, 9812.55] + - [950, 9812.55] - - [1024, 3456, 1, 4096] - - [885, 9317.91] + - [959, 9317.91] - - [1024, 3394, 1, 4096] - - [884, 9148.86] + - [958, 9148.86] - - [64, 100, 624, 102] - - [906, 5416.95] + - [980, 5416.95] - - [4096, 3165, 1, 1024] - - [876, 9743.35] + - [950, 9743.35] - - [4096, 3470, 1, 1024] - - [866, 9691.04] + - [940, 9691.04] - - [1024, 3014, 1, 4096] - - [867, 9486.26] + - [941, 9486.26] - - [1024, 3375, 1, 4096] - - [885, 9082.71] + - [959, 9082.71] - - [4096, 3859, 1, 1024] - - [876, 9738.87] + - [950, 9738.87] - - [4096, 3365, 1, 1024] - - [877, 9694.74] + - [951, 9694.74] - - [1024, 3162, 1, 4096] - - [878, 8550.31] + - [952, 8550.31] - - [1024, 3840, 1, 33708] - - [875, 9409.08] + - [949, 9409.08] - - [1024, 3437, 1, 4096] - - [885, 9270.49] + - [959, 9270.49] - - [4096, 3319, 1, 1024] - - [866, 9927.15] + - [940, 9927.15] - - [1024, 3320, 1, 4096] - - [885, 8962.29] + - [959, 8962.29] - - [64, 23, 2720, 23] - - [908, 2569.53] + - [982, 2569.53] - - [4096, 3328, 1, 1024] - - [865, 9997.41] + - [939, 9997.41] - - [1024, 3235, 1, 4096] - - [885, 8724.31] + - [959, 8724.31] - - [4096, 3282, 1, 1024] - - [866, 9827.13] + - [940, 9827.13] - - [1024, 3367, 1, 4096] - - [878, 9084.02] + - [952, 9084.02] - - [1024, 3542, 1, 4096] - - [885, 9533.1] + - [959, 9533.1] - - [64, 177, 352, 177] - - [871, 6817.91] + - [945, 6817.91] - - [4096, 3145, 1, 1024] - - [862, 9710.28] + - [936, 9710.28] - - [4096, 3514, 1, 1024] - - [865, 9793.06] + - [939, 9793.06] - - [1024, 3432, 1, 4096] - - [885, 9249.39] + - [959, 9249.39] - - [4096, 3409, 1, 1024] - - [861, 9721.6] + - [935, 9721.6] - - [1024, 4012, 1, 33708] - - [865, 9773.35] + - [939, 9773.35] - - [4096, 3876, 1, 1024] - - [862, 9745.65] + - [936, 9745.65] - - [4096, 3299, 1, 1024] - - [865, 9873.53] + - [939, 9873.53] - - [1024, 3168, 1, 4096] - - [878, 8597.13] + - [952, 8597.13] - - [4096, 3681, 1, 1024] - - [877, 9840.03] + - [951, 9840.03] - - [4096, 3531, 1, 1024] - - [866, 9847.76] + - [940, 9847.76] - - [4096, 3388, 1, 1024] - - [877, 9772.28] + - [951, 9772.28] - - [1024, 3720, 1, 4096] - - [866, 8951.6] + - [940, 8951.6] - - [1024, 3332, 1, 4096] - - [885, 8978.97] + - [959, 8978.97] - - [1024, 3273, 1, 4096] - - [879, 8982.49] + - [953, 8982.49] - - [1024, 2935, 1, 4096] - - [868, 9224.89] + - [942, 9224.89] - - [1024, 3467, 1, 4096] - - [882, 9329.33] + - [956, 9329.33] - - [4096, 3542, 1, 1024] - - [865, 9858.51] + - [939, 9858.51] - - [1024, 3130, 1, 4096] - - [864, 8526.66] + - [938, 8526.66] - - [1024, 3405, 1, 4096] - - [885, 9163.44] + - [959, 9163.44] - - [1024, 3960, 1, 1024] - - [863, 9280.36] + - [937, 9280.36] - - [4096, 3405, 1, 1024] - - [876, 9710.2] + - [950, 9710.2] - - [512, 512, 1, 1024] - - [1062, 6670.96] + - [1136, 6670.96] - - [8, 500, 1, 512] - - [958, 228.671] + - [1032, 228.671] - - [512, 512, 1, 2000] - - [1095, 7629.44] + - [1169, 7629.44] - - [32, 512, 1, 512] - - [955, 904.045] + - [1029, 904.045] - - [100, 1024, 1, 2048] - - [1017, 3196.98] + - [1091, 3196.98] - - [8, 512, 1, 500] - - [948, 237.137] + - [1022, 237.137] - - [8, 500, 1, 1024] - - [1012, 289.366] + - [1086, 289.366] - - [100, 2000, 1, 1024] - - [1051, 3368.52] + - [1125, 3368.52] - - [64, 1024, 1, 100] - - [950, 941.709] + - [1024, 941.709] - - [64, 1024, 1, 500] - - [1077, 2659.84] + - [1151, 2659.84] - - [64, 1024, 1, 1024] - - [1015, 2452.91] + - [1089, 2452.91] - - [128, 2000, 1, 100] - - [1071, 2560.1] + - [1145, 2560.1] - - [2, 500, 1, 2048] - - [1012, 72.2127] + - [1086, 72.2127] - - [16, 512, 1, 10] - - [926, 18.3857] + - [1000, 18.3857] - - [64, 2000, 1, 1024] - - [1082, 2800.78] + - [1156, 2800.78] - - [100, 1024, 1, 1024] - - [1010, 3034.17] + - [1084, 3034.17] - - [8, 512, 1, 10] - - [988, 9.24286] + - [1062, 9.24286] - - [16, 500, 1, 2048] - - [1012, 565.846] + - [1086, 565.846] - - [10, 100, 1, 500] - - [948, 58.5112] + - [1022, 58.5112] - - [16, 100, 1, 10] - - [988, 3.67143] + - [1062, 3.67143] - - [500, 1024, 1, 512] - - [1078, 6514.61] + - [1152, 6514.61] - - [128, 1024, 1, 512] - - [1096, 4194.4] + - [1170, 4194.4] - - [512, 500, 1, 2000] - - [1054, 7347.98] + - [1128, 7347.98] - - [2, 100, 1, 2000] - - [948, 20.9333] + - [1022, 20.9333] - - [500, 512, 1, 100] - - [1070, 2539.78] + - [1144, 2539.78] - - [100, 1024, 1, 500] - - [1096, 3216.18] + - [1170, 3216.18] - - [256, 100, 1, 2048] - - [1106, 1689.17] + - [1180, 1689.17] - - [2, 512, 1, 512] - - [962, 50.5123] + - [1036, 50.5123] - - [128, 2000, 1, 512] - - [1082, 4641.46] + - [1156, 4641.46] - - [2, 100, 1, 10] - - [926, 0.496825] + - [1000, 0.496825] - - [16, 2000, 1, 2048] - - [970, 1266.25] + - [1044, 1266.25] - - [200, 100, 1, 100] - - [1116, 316.556] + - [1190, 316.556] - - [256, 1024, 1, 100] - - [1072, 2686.0] + - [1146, 2686.0] - - [200, 500, 1, 1024] - - [1121, 3282.15] + - [1195, 3282.15] - - [500, 100, 1, 100] - - [1035, 631.413] + - [1109, 631.413] - - [4, 100, 1, 10] - - [933, 0.977193] + - [1007, 0.977193] - - [32, 100, 1, 512] - - [1012, 198.935] + - [1086, 198.935] - - [100, 2000, 1, 512] - - [1082, 3832.44] + - [1156, 3832.44] - - [16, 1024, 1, 512] - - [996, 794.476] + - [1070, 794.476] - - [200, 512, 1, 100] - - [1114, 1306.22] + - [1188, 1306.22] - - [4, 1024, 1, 1024] - - [955, 213.225] + - [1029, 213.225] - - [512, 1024, 1, 512] - - [1079, 7049.35] + - [1153, 7049.35] - - [4, 512, 1, 10] - - [987, 4.59123] + - [1061, 4.59123] - - [2, 2048, 1, 2000] - - [948, 300.393] + - [1022, 300.393] - - [64, 2048, 1, 10] - - [1108, 241.041] + - [1182, 241.041] - - [128, 100, 1, 10] - - [1113, 27.6862] + - [1187, 27.6862] - - [4, 512, 1, 2048] - - [948, 146.549] + - [1022, 146.549] - - [64, 2048, 1, 500] - - [1088, 4015.79] + - [1162, 4015.79] - - [512, 512, 1, 512] - - [1043, 6123.17] + - [1117, 6123.17] - - [500, 500, 1, 2000] - - [1054, 7126.67] + - [1128, 7126.67] - - [10, 1024, 1, 2000] - - [1021, 807.671] + - [1095, 807.671] - - [256, 100, 1, 100] - - [1033, 296.396] + - [1107, 296.396] - - [32, 2000, 1, 2048] - - [976, 2167.3] + - [1050, 2167.3] - - [64, 1024, 1, 2048] - - [1009, 2383.23] + - [1083, 2383.23] - - [200, 2048, 1, 512] - - [1084, 5264.04] + - [1158, 5264.04] - - [256, 500, 1, 10] - - [1066, 210.626] + - [1140, 210.626] - - [16, 1024, 1, 100] - - [946, 262.664] + - [1020, 262.664] - - [32, 1024, 1, 1024] - - [951, 1476.97] + - [1025, 1476.97] - - [512, 500, 1, 512] - - [1040, 5851.53] + - [1114, 5851.53] - - [128, 1024, 1, 2000] - - [1124, 5516.6] + - [1198, 5516.6] - - [8, 100, 1, 500] - - [948, 46.3963] + - [1022, 46.3963] - - [100, 2000, 1, 2048] - - [1103, 3715.63] + - [1177, 3715.63] - - [10, 512, 1, 512] - - [958, 292.671] + - [1032, 292.671] - - [8, 500, 1, 10] - - [987, 8.87193] + - [1061, 8.87193] - - [10, 2000, 1, 1024] - - [1001, 640.1] + - [1075, 640.1] - - [16, 1024, 1, 10] - - [986, 36.6714] + - [1060, 36.6714] - - [16, 512, 1, 2048] - - [965, 585.897] + - [1039, 585.897] - - [256, 512, 1, 10] - - [1031, 230.861] + - [1105, 230.861] - - [2, 2000, 1, 100] - - [993, 64.2026] + - [1067, 64.2026] - - [128, 512, 1, 2048] - - [960, 3106.99] + - [1034, 3106.99] - - [128, 512, 1, 100] - - [953, 952.658] + - [1027, 952.658] - - [512, 2000, 1, 1024] - - [1050, 8066.07] + - [1124, 8066.07] - - [64, 500, 1, 2048] - - [1119, 1857.7] + - [1193, 1857.7] - - [64, 2000, 1, 2048] - - [1101, 3442.12] + - [1175, 3442.12] - - [64, 2048, 1, 512] - - [1102, 3315.76] + - [1176, 3315.76] - - [10, 2000, 1, 512] - - [948, 785.376] + - [1022, 785.376] - - [32, 2000, 1, 500] - - [951, 2500.1] + - [1025, 2500.1] - - [64, 2000, 1, 10] - - [939, 231.984] + - [1013, 231.984] - - [500, 100, 1, 10] - - [1036, 88.1282] + - [1110, 88.1282] - - [128, 1024, 1, 500] - - [1087, 4096.1] + - [1161, 4096.1] - - [64, 100, 1, 2048] - - [948, 587.34] + - [1022, 587.34] - - [64, 100, 1, 10] - - [1107, 12.0403] + - [1181, 12.0403] - - [16, 512, 1, 500] - - [958, 461.361] + - [1032, 461.361] - - [32, 2000, 1, 1024] - - [945, 1713.91] + - [1019, 1713.91] - - [200, 512, 1, 1024] - - [1124, 3244.46] + - [1198, 3244.46] - - [128, 2048, 1, 10] - - [940, 455.211] + - [1014, 455.211] - - [200, 100, 1, 2000] - - [948, 1462.09] + - [1022, 1462.09] - - [2, 100, 1, 512] - - [948, 12.5272] + - [1022, 12.5272] - - [64, 2048, 1, 100] - - [1114, 1689.17] + - [1188, 1689.17] - - [32, 512, 1, 100] - - [947, 266.074] + - [1021, 266.074] - - [16, 512, 1, 1024] - - [1012, 569.978] + - [1086, 569.978] - - [4, 1024, 1, 512] - - [1002, 208.151] + - [1076, 208.151] - - [64, 2000, 1, 100] - - [1114, 1649.58] + - [1188, 1649.58] - - [512, 2048, 1, 512] - - [1050, 7849.09] + - [1124, 7849.09] - - [2, 500, 1, 500] - - [936, 53.5188] + - [1010, 53.5188] - - [32, 100, 1, 100] - - [947, 57.2429] + - [1021, 57.2429] - - [100, 500, 1, 2000] - - [951, 2784.06] + - [1025, 2784.06] - - [200, 2000, 1, 100] - - [1023, 2994.11] + - [1097, 2994.11] - - [10, 512, 1, 10] - - [983, 11.1345] + - [1057, 11.1345] - - [100, 500, 1, 2048] - - [1123, 2361.72] + - [1197, 2361.72] - - [4, 2048, 1, 500] - - [958, 379.359] + - [1032, 379.359] - - [200, 500, 1, 100] - - [1084, 1288.76] + - [1158, 1288.76] - - [500, 500, 1, 500] - - [1040, 5425.45] + - [1114, 5425.45] - - [2, 100, 1, 1024] - - [1012, 16.3025] + - [1086, 16.3025] - - [128, 2048, 1, 512] - - [1098, 4699.6] + - [1172, 4699.6] - - [200, 2000, 1, 1024] - - [1048, 4621.04] + - [1122, 4621.04] - - [32, 512, 1, 1024] - - [1011, 1028.12] + - [1085, 1028.12] - - [100, 2048, 1, 500] - - [1072, 4142.49] + - [1146, 4142.49] - - [256, 100, 1, 1024] - - [1102, 1443.62] + - [1176, 1443.62] - - [16, 2000, 1, 500] - - [997, 1428.67] + - [1071, 1428.67] - - [128, 100, 1, 100] - - [947, 213.433] + - [1021, 213.433] - - [500, 500, 1, 2048] - - [1044, 6639.1] + - [1118, 6639.1] - - [32, 512, 1, 10] - - [980, 36.0298] + - [1054, 36.0298] - - [128, 100, 1, 1024] - - [1008, 791.598] + - [1082, 791.598] - - [16, 500, 1, 2000] - - [1021, 694.544] + - [1095, 694.544] - - [4, 2048, 1, 100] - - [992, 129.72] + - [1066, 129.72] - - [64, 500, 1, 500] - - [934, 1333.43] + - [1008, 1333.43] - - [500, 1024, 1, 2048] - - [1053, 7031.86] + - [1127, 7031.86] - - [512, 2048, 1, 100] - - [1028, 5285.26] + - [1102, 5285.26] - - [128, 512, 1, 1024] - - [1120, 2519.2] + - [1194, 2519.2] - - [128, 512, 1, 2000] - - [1118, 3608.91] + - [1192, 3608.91] - - [128, 2000, 1, 2000] - - [1091, 7017.64] + - [1165, 7017.64] - - [2, 512, 1, 10] - - [984, 2.13175] + - [1058, 2.13175] - - [10, 512, 1, 500] - - [948, 293.678] + - [1022, 293.678] - - [4, 1024, 1, 2000] - - [968, 326.215] + - [1042, 326.215] - - [256, 100, 1, 2000] - - [1105, 1768.06] + - [1179, 1768.06] - - [512, 2048, 1, 2000] - - [1050, 8674.62] + - [1124, 8674.62] - - [100, 100, 1, 10] - - [1112, 21.6517] + - [1186, 21.6517] - - [256, 500, 1, 1024] - - [1052, 4833.14] + - [1126, 4833.14] - - [128, 512, 1, 10] - - [940, 132.229] + - [1014, 132.229] - - [256, 100, 1, 500] - - [1099, 914.386] + - [1173, 914.386] - - [64, 100, 1, 512] - - [1006, 369.109] + - [1080, 369.109] - - [64, 512, 1, 500] - - [948, 1600.1] + - [1022, 1600.1] - - [64, 2048, 1, 2000] - - [1102, 5925.6] + - [1176, 5925.6] - - [100, 2048, 1, 1024] - - [1060, 3260.6] + - [1134, 3260.6] - - [200, 2000, 1, 10] - - [940, 595.338] + - [1014, 595.338] - - [128, 1024, 1, 100] - - [1084, 1689.17] + - [1158, 1689.17] - - [16, 2000, 1, 100] - - [947, 493.927] + - [1021, 493.927] - - [8, 100, 1, 512] - - [948, 49.8087] + - [1022, 49.8087] - - [500, 2048, 1, 1024] - - [1050, 7651.71] + - [1124, 7651.71] - - [500, 2000, 1, 10] - - [1038, 1008.16] + - [1112, 1008.16] - - [32, 100, 1, 500] - - [1012, 187.016] + - [1086, 187.016] - - [256, 1024, 1, 2048] - - [1053, 6190.95] + - [1127, 6190.95] - - [32, 500, 1, 2048] - - [948, 1083.7] + - [1022, 1083.7] - - [4, 2000, 1, 10] - - [991, 17.6439] + - [1065, 17.6439] - - [128, 500, 1, 2000] - - [1008, 3516.58] + - [1082, 3516.58] - - [8, 1024, 1, 10] - - [982, 18.0649] + - [1056, 18.0649] - - [2, 500, 1, 100] - - [927, 16.1256] + - [1001, 16.1256] - - [10, 500, 1, 512] - - [948, 291.009] + - [1022, 291.009] - - [10, 2000, 1, 10] - - [926, 38.5615] + - [1000, 38.5615] - - [500, 512, 1, 512] - - [1043, 5893.63] + - [1117, 5893.63] - - [32, 500, 1, 500] - - [948, 892.957] + - [1022, 892.957] - - [256, 500, 1, 2000] - - [1057, 6237.92] + - [1131, 6237.92] - - [100, 500, 1, 100] - - [959, 726.844] + - [1033, 726.844] - - [500, 2048, 1, 100] - - [1032, 4867.02] + - [1106, 4867.02] - - [10, 1024, 1, 512] - - [948, 520.227] + - [1022, 520.227] - - [2, 2048, 1, 512] - - [958, 151.628] + - [1032, 151.628] - - [256, 512, 1, 100] - - [1037, 1590.78] + - [1111, 1590.78] - - [10, 2048, 1, 100] - - [948, 324.151] + - [1022, 324.151] - - [8, 2048, 1, 100] - - [1003, 256.1] + - [1077, 256.1] - - [512, 100, 1, 512] - - [1099, 2100.61] + - [1173, 2100.61] - - [4, 500, 1, 500] - - [948, 115.841] + - [1022, 115.841] - - [64, 100, 1, 1024] - - [948, 450.21] + - [1022, 450.21] - - [2, 2048, 1, 1024] - - [1005, 137.708] + - [1079, 137.708] - - [2, 500, 1, 2000] - - [974, 90.3527] + - [1048, 90.3527] - - [512, 1024, 1, 500] - - [1079, 6898.63] + - [1153, 6898.63] - - [128, 2000, 1, 500] - - [1084, 5161.39] + - [1158, 5161.39] - - [32, 512, 1, 2048] - - [1018, 1103.86] + - [1092, 1103.86] - - [10, 100, 1, 2000] - - [948, 106.032] + - [1022, 106.032] - - [4, 100, 1, 512] - - [948, 24.7154] + - [1022, 24.7154] - - [2, 512, 1, 2048] - - [1012, 73.3246] + - [1086, 73.3246] - - [200, 512, 1, 2048] - - [1124, 3954.01] + - [1198, 3954.01] - - [200, 2000, 1, 2000] - - [1086, 6230.63] + - [1160, 6230.63] - - [100, 100, 1, 2000] - - [948, 827.915] + - [1022, 827.915] - - [500, 2048, 1, 2000] - - [1049, 8388.04] + - [1123, 8388.04] - - [64, 2048, 1, 2048] - - [1094, 3406.64] + - [1168, 3406.64] - - [16, 2000, 1, 1024] - - [954, 1024.1] + - [1028, 1024.1] - - [512, 2048, 1, 1024] - - [1027, 8061.22] + - [1101, 8061.22] - - [10, 500, 1, 500] - - [958, 284.191] + - [1032, 284.191] - - [200, 1024, 1, 2048] - - [1122, 4886.29] + - [1196, 4886.29] - - [10, 2000, 1, 2000] - - [948, 1449.38] + - [1022, 1449.38] - - [8, 2000, 1, 500] - - [997, 719.524] + - [1071, 719.524] - - [2, 100, 1, 2048] - - [1012, 19.945] + - [1086, 19.945] - - [32, 100, 1, 2048] - - [1012, 323.894] + - [1086, 323.894] - - [512, 512, 1, 10] - - [1069, 420.203] + - [1143, 420.203] - - [512, 500, 1, 10] - - [1074, 376.571] + - [1148, 376.571] - - [16, 100, 1, 1024] - - [958, 129.72] + - [1032, 129.72] - - [2, 500, 1, 10] - - [922, 2.21864] + - [996, 2.21864] - - [200, 512, 1, 10] - - [924, 188.335] + - [998, 188.335] - - [512, 1024, 1, 100] - - [1024, 3877.97] + - [1098, 3877.97] - - [16, 2000, 1, 2000] - - [948, 2222.32] + - [1022, 2222.32] - - [500, 500, 1, 1024] - - [1044, 6130.37] + - [1118, 6130.37] - - [500, 100, 1, 2048] - - [1099, 2949.41] + - [1173, 2949.41] - - [256, 1024, 1, 512] - - [1063, 5886.84] + - [1137, 5886.84] - - [256, 500, 1, 512] - - [1041, 4380.85] + - [1115, 4380.85] - - [16, 1024, 1, 2000] - - [1012, 1208.36] + - [1086, 1208.36] - - [200, 500, 1, 2048] - - [1124, 3855.52] + - [1198, 3855.52] - - [256, 2000, 1, 10] - - [1026, 727.373] + - [1100, 727.373] - - [10, 2048, 1, 2048] - - [979, 823.158] + - [1053, 823.158] - - [512, 2000, 1, 100] - - [1028, 5120.1] + - [1102, 5120.1] - - [10, 1024, 1, 1024] - - [955, 553.146] + - [1029, 553.146] - - [512, 2000, 1, 2048] - - [1056, 7563.4] + - [1130, 7563.4] - - [500, 1024, 1, 500] - - [1080, 6570.94] + - [1154, 6570.94] - - [500, 100, 1, 512] - - [1099, 2038.32] + - [1173, 2038.32] - - [256, 2000, 1, 100] - - [1048, 3764.81] + - [1122, 3764.81] - - [512, 1024, 1, 2048] - - [1092, 7286.62] + - [1166, 7286.62] - - [32, 512, 1, 500] - - [948, 898.346] + - [1022, 898.346] - - [100, 2000, 1, 10] - - [940, 333.433] + - [1014, 333.433] - - [100, 500, 1, 512] - - [1118, 2176.97] + - [1192, 2176.97] - - [8, 2000, 1, 512] - - [997, 602.453] + - [1071, 602.453] - - [100, 2048, 1, 2048] - - [1104, 3694.87] + - [1178, 3694.87] - - [128, 1024, 1, 2048] - - [1123, 4168.35] + - [1197, 4168.35] - - [8, 500, 1, 2000] - - [1022, 352.213] + - [1096, 352.213] - - [100, 2000, 1, 500] - - [1072, 4045.41] + - [1146, 4045.41] - - [100, 2048, 1, 100] - - [1072, 2081.4] + - [1146, 2081.4] - - [4, 100, 1, 1024] - - [948, 33.1323] + - [1022, 33.1323] - - [500, 2048, 1, 2048] - - [1056, 7765.03] + - [1130, 7765.03] - - [2, 2000, 1, 2048] - - [967, 166.334] + - [1041, 166.334] - - [200, 2048, 1, 10] - - [941, 609.624] + - [1015, 609.624] - - [2, 500, 1, 1024] - - [1012, 75.3941] + - [1086, 75.3941] - - [100, 500, 1, 1024] - - [1008, 1975.41] + - [1082, 1975.41] - - [16, 2048, 1, 500] - - [948, 1473.48] + - [1022, 1473.48] - - [100, 1024, 1, 10] - - [1108, 185.607] + - [1182, 185.607] - - [8, 2048, 1, 1024] - - [1004, 543.404] + - [1078, 543.404] - - [2, 2000, 1, 500] - - [948, 179.956] + - [1022, 179.956] - - [32, 100, 1, 1024] - - [948, 267.812] + - [1022, 267.812] - - [500, 2000, 1, 512] - - [1078, 7087.59] + - [1152, 7087.59] - - [64, 100, 1, 2000] - - [958, 615.485] + - [1032, 615.485] - - [100, 1024, 1, 2000] - - [1121, 4224.52] + - [1195, 4224.52] - - [64, 500, 1, 10] - - [923, 63.5921] + - [997, 63.5921] - - [32, 2048, 1, 100] - - [944, 941.709] + - [1018, 941.709] - - [64, 500, 1, 512] - - [948, 1575.48] + - [1022, 1575.48] - - [10, 100, 1, 1024] - - [958, 82.6806] + - [1032, 82.6806] - - [16, 512, 1, 100] - - [947, 148.506] + - [1021, 148.506] - - [4, 100, 1, 2000] - - [1021, 43.9597] + - [1095, 43.9597] - - [2, 512, 1, 1024] - - [1012, 74.152] + - [1086, 74.152] - - [64, 512, 1, 1024] - - [1013, 1571.0] + - [1087, 1571.0] - - [10, 2048, 1, 500] - - [948, 920.963] + - [1022, 920.963] - - [4, 2000, 1, 2048] - - [967, 326.215] + - [1041, 326.215] - - [512, 100, 1, 2048] - - [1102, 3084.15] + - [1176, 3084.15] - - [32, 100, 1, 2000] - - [948, 343.448] + - [1022, 343.448] - - [256, 512, 1, 500] - - [1041, 4311.68] + - [1115, 4311.68] - - [100, 2000, 1, 100] - - [1072, 2016.23] + - [1146, 2016.23] - - [8, 2000, 1, 1024] - - [961, 544.781] + - [1035, 544.781] - - [4, 512, 1, 500] - - [948, 118.619] + - [1022, 118.619] - - [128, 1024, 1, 10] - - [1111, 244.637] + - [1185, 244.637] - - [4, 500, 1, 1024] - - [948, 144.733] + - [1022, 144.733] - - [32, 2048, 1, 512] - - [951, 2140.05] + - [1025, 2140.05] - - [32, 100, 1, 10] - - [926, 7.11754] + - [1000, 7.11754] - - [100, 2048, 1, 10] - - [1115, 341.433] + - [1189, 341.433] - - [512, 500, 1, 100] - - [1076, 2461.64] + - [1150, 2461.64] - - [128, 2000, 1, 1024] - - [1060, 4174.37] + - [1134, 4174.37] - - [200, 1024, 1, 500] - - [1072, 4295.4] + - [1146, 4295.4] - - [32, 2048, 1, 1024] - - [975, 1667.82] + - [1049, 1667.82] - - [10, 1024, 1, 2048] - - [966, 555.49] + - [1040, 555.49] - - [8, 500, 1, 100] - - [947, 71.5286] + - [1021, 71.5286] - - [32, 2048, 1, 500] - - [951, 2528.5] + - [1025, 2528.5] - - [200, 100, 1, 1024] - - [960, 1071.23] + - [1034, 1071.23] - - [16, 100, 1, 100] - - [937, 28.6714] + - [1011, 28.6714] - - [8, 1024, 1, 2000] - - [1021, 654.413] + - [1095, 654.413] - - [4, 512, 1, 100] - - [947, 36.6714] + - [1021, 36.6714] - - [16, 500, 1, 100] - - [947, 142.957] + - [1021, 142.957] - - [8, 1024, 1, 2048] - - [973, 441.606] + - [1047, 441.606] - - [16, 1024, 1, 2048] - - [974, 886.845] + - [1048, 886.845] - - [10, 2048, 1, 1024] - - [952, 639.476] + - [1026, 639.476] - - [64, 512, 1, 100] - - [947, 518.581] + - [1021, 518.581] - - [2, 100, 1, 500] - - [948, 9.71538] + - [1022, 9.71538] - - [2, 500, 1, 512] - - [954, 48.2203] + - [1028, 48.2203] - - [256, 512, 1, 2000] - - [1057, 6450.49] + - [1131, 6450.49] - - [128, 500, 1, 1024] - - [951, 2497.66] + - [1025, 2497.66] - - [10, 100, 1, 10] - - [988, 2.33214] + - [1062, 2.33214] - - [8, 2048, 1, 2048] - - [938, 643.398] + - [1012, 643.398] - - [16, 2048, 1, 2048] - - [978, 1338.0] + - [1052, 1338.0] - - [64, 1024, 1, 10] - - [941, 132.229] + - [1015, 132.229] - - [500, 100, 1, 500] - - [1099, 1941.09] + - [1173, 1941.09] - - [256, 1024, 1, 2000] - - [1095, 7629.44] + - [1169, 7629.44] - - [200, 512, 1, 500] - - [1084, 3232.42] + - [1158, 3232.42] - - [8, 2000, 1, 10] - - [985, 32.3581] + - [1059, 32.3581] - - [64, 2000, 1, 512] - - [1083, 3225.3] + - [1157, 3225.3] - - [2, 512, 1, 100] - - [927, 16.7234] + - [1001, 16.7234] - - [4, 2000, 1, 2000] - - [948, 586.61] + - [1022, 586.61] - - [200, 1024, 1, 100] - - [1072, 2133.43] + - [1146, 2133.43] - - [16, 100, 1, 500] - - [1012, 92.6926] + - [1086, 92.6926] - - [128, 100, 1, 500] - - [1008, 526.416] + - [1082, 526.416] - - [500, 1024, 1, 1024] - - [1042, 7201.86] + - [1116, 7201.86] - - [200, 1024, 1, 1024] - - [1094, 4519.82] + - [1168, 4519.82] - - [8, 2048, 1, 512] - - [958, 624.252] + - [1032, 624.252] - - [200, 2000, 1, 500] - - [1048, 5186.82] + - [1122, 5186.82] - - [512, 100, 1, 1024] - - [1099, 2742.19] + - [1173, 2742.19] - - [16, 100, 1, 2000] - - [958, 168.876] + - [1032, 168.876] - - [500, 512, 1, 2000] - - [1095, 7289.39] + - [1169, 7289.39] - - [8, 2000, 1, 2048] - - [969, 668.289] + - [1043, 668.289] - - [256, 2048, 1, 100] - - [1030, 3924.41] + - [1104, 3924.41] - - [32, 2048, 1, 2000] - - [962, 3882.56] + - [1036, 3882.56] - - [200, 500, 1, 512] - - [1087, 3368.52] + - [1161, 3368.52] - - [10, 512, 1, 100] - - [947, 91.5286] + - [1021, 91.5286] - - [16, 2000, 1, 10] - - [925, 61.6385] + - [999, 61.6385] - - [8, 512, 1, 100] - - [947, 72.2127] + - [1021, 72.2127] - - [256, 512, 1, 512] - - [1052, 4584.04] + - [1126, 4584.04] - - [500, 2000, 1, 1024] - - [1027, 7569.59] + - [1101, 7569.59] - - [512, 512, 1, 500] - - [1043, 5708.81] + - [1117, 5708.81] - - [256, 2048, 1, 1024] - - [1067, 5923.21] + - [1141, 5923.21] - - [8, 2048, 1, 2000] - - [948, 1153.9] + - [1022, 1153.9] - - [100, 512, 1, 2048] - - [1014, 2383.23] + - [1088, 2383.23] - - [100, 1024, 1, 512] - - [1099, 3343.77] + - [1173, 3343.77] - - [128, 100, 1, 2000] - - [1117, 1084.85] + - [1191, 1084.85] - - [4, 2048, 1, 2048] - - [966, 332.454] + - [1040, 332.454] - - [2, 1024, 1, 2000] - - [977, 161.106] + - [1051, 161.106] - - [100, 512, 1, 512] - - [951, 2184.63] + - [1025, 2184.63] - - [128, 1024, 1, 1024] - - [1094, 3848.09] + - [1168, 3848.09] - - [200, 2048, 1, 1024] - - [1029, 4547.26] + - [1103, 4547.26] - - [32, 1024, 1, 2000] - - [958, 2416.62] + - [1032, 2416.62] - - [128, 500, 1, 100] - - [953, 919.64] + - [1027, 919.64] - - [200, 512, 1, 2000] - - [1121, 4238.51] + - [1195, 4238.51] - - [10, 2048, 1, 2000] - - [958, 1454.65] + - [1032, 1454.65] - - [256, 1024, 1, 500] - - [1055, 5669.3] + - [1129, 5669.3] - - [100, 100, 1, 100] - - [947, 171.333] + - [1021, 171.333] - - [8, 512, 1, 1024] - - [1016, 286.596] + - [1090, 286.596] - - [200, 1024, 1, 512] - - [1072, 4354.65] + - [1146, 4354.65] - - [256, 500, 1, 500] - - [1057, 4020.2] + - [1131, 4020.2] - - [200, 100, 1, 500] - - [1121, 702.347] + - [1195, 702.347] - - [2, 1024, 1, 2048] - - [967, 112.85] + - [1041, 112.85] - - [256, 500, 1, 2048] - - [1057, 5041.33] + - [1131, 5041.33] - - [512, 2048, 1, 500] - - [1050, 7710.22] + - [1124, 7710.22] - - [512, 100, 1, 2000] - - [1099, 3099.37] + - [1173, 3099.37] - - [512, 500, 1, 1024] - - [1058, 6463.22] + - [1132, 6463.22] - - [16, 512, 1, 2000] - - [974, 721.227] + - [1048, 721.227] - - [64, 500, 1, 1024] - - [1013, 1528.46] + - [1087, 1528.46] - - [512, 2000, 1, 10] - - [1034, 1174.41] + - [1108, 1174.41] - - [256, 512, 1, 1024] - - [1052, 4978.5] + - [1126, 4978.5] - - [10, 512, 1, 1024] - - [1012, 370.36] + - [1086, 370.36] - - [512, 100, 1, 100] - - [1035, 659.894] + - [1109, 659.894] - - [8, 2000, 1, 100] - - [947, 256.51] + - [1021, 256.51] - - [128, 2048, 1, 1024] - - [1060, 4173.54] + - [1134, 4173.54] - - [2, 2000, 1, 2000] - - [948, 250.727] + - [1022, 250.727] - - [16, 2048, 1, 1024] - - [995, 1046.06] + - [1069, 1046.06] - - [500, 512, 1, 500] - - [1040, 5517.34] + - [1114, 5517.34] - - [8, 100, 1, 1024] - - [1013, 64.1] + - [1087, 64.1] - - [10, 100, 1, 100] - - [937, 17.9571] + - [1011, 17.9571] - - [200, 500, 1, 500] - - [1087, 3140.8] + - [1161, 3140.8] - - [10, 500, 1, 2000] - - [974, 444.94] + - [1048, 444.94] - - [500, 100, 1, 2000] - - [1102, 2969.22] + - [1176, 2969.22] - - [100, 512, 1, 2000] - - [1014, 2776.67] + - [1088, 2776.67] - - [500, 1024, 1, 2000] - - [1093, 8020.15] + - [1167, 8020.15] - - [32, 2000, 1, 2000] - - [954, 3827.85] + - [1028, 3827.85] - - [64, 1024, 1, 512] - - [1118, 2573.29] + - [1192, 2573.29] - - [64, 2000, 1, 2000] - - [1087, 5797.2] + - [1161, 5797.2] - - [32, 500, 1, 100] - - [947, 266.767] + - [1021, 266.767] - - [128, 2000, 1, 2048] - - [1103, 4548.05] + - [1177, 4548.05] - - [10, 100, 1, 2048] - - [1012, 98.5615] + - [1086, 98.5615] - - [32, 2048, 1, 2048] - - [975, 2213.45] + - [1049, 2213.45] - - [64, 100, 1, 100] - - [948, 96.4855] + - [1022, 96.4855] - - [2, 1024, 1, 100] - - [998, 34.6946] + - [1072, 34.6946] - - [256, 1024, 1, 10] - - [1068, 425.658] + - [1142, 425.658] - - [256, 1024, 1, 1024] - - [1061, 5482.85] + - [1135, 5482.85] - - [64, 500, 1, 2000] - - [948, 2056.66] + - [1022, 2056.66] - - [512, 2000, 1, 512] - - [1046, 7550.33] + - [1120, 7550.33] - - [8, 512, 1, 512] - - [955, 232.086] + - [1029, 232.086] - - [8, 512, 1, 2048] - - [948, 290.564] + - [1022, 290.564] - - [100, 100, 1, 1024] - - [1118, 624.49] + - [1192, 624.49] - - [2, 2048, 1, 10] - - [991, 8.92759] + - [1065, 8.92759] - - [4, 2048, 1, 512] - - [997, 312.176] + - [1071, 312.176] - - [4, 2048, 1, 10] - - [990, 18.0649] + - [1064, 18.0649] - - [8, 100, 1, 2000] - - [967, 85.9369] + - [1041, 85.9369] - - [2, 1024, 1, 1024] - - [964, 101.314] + - [1038, 101.314] - - [16, 2048, 1, 100] - - [948, 518.581] + - [1022, 518.581] - - [16, 512, 1, 512] - - [958, 456.003] + - [1032, 456.003] - - [32, 500, 1, 512] - - [955, 906.295] + - [1029, 906.295] - - [500, 2000, 1, 2000] - - [1050, 8143.42] + - [1124, 8143.42] - - [500, 1024, 1, 10] - - [1031, 680.951] + - [1105, 680.951] - - [32, 500, 1, 1024] - - [1007, 1008.97] + - [1081, 1008.97] - - [32, 500, 1, 10] - - [943, 33.4333] + - [1017, 33.4333] - - [500, 500, 1, 10] - - [1072, 367.747] + - [1146, 367.747] - - [4, 2000, 1, 500] - - [958, 370.47] + - [1032, 370.47] - - [10, 2000, 1, 500] - - [948, 899.381] + - [1022, 899.381] - - [32, 2000, 1, 512] - - [960, 2089.9] + - [1034, 2089.9] - - [256, 500, 1, 100] - - [1073, 1495.43] + - [1147, 1495.43] - - [256, 2048, 1, 10] - - [1031, 789.69] + - [1105, 789.69] - - [4, 1024, 1, 500] - - [948, 222.709] + - [1022, 222.709] - - [256, 512, 1, 2048] - - [1057, 5292.6] + - [1131, 5292.6] - - [2, 2000, 1, 1024] - - [995, 137.365] + - [1069, 137.365] - - [256, 100, 1, 512] - - [1099, 1085.13] + - [1173, 1085.13] - - [8, 1024, 1, 500] - - [948, 441.479] + - [1022, 441.479] - - [256, 2048, 1, 500] - - [1078, 7031.86] + - [1152, 7031.86] - - [256, 2048, 1, 2048] - - [1041, 6771.93] + - [1115, 6771.93] - - [2, 2000, 1, 512] - - [1002, 159.106] + - [1076, 159.106] - - [256, 2000, 1, 512] - - [1045, 6527.59] + - [1119, 6527.59] - - [4, 1024, 1, 100] - - [994, 70.237] + - [1068, 70.237] - - [512, 1024, 1, 2000] - - [1079, 8295.8] + - [1153, 8295.8] - - [100, 500, 1, 500] - - [951, 2016.23] + - [1025, 2016.23] - - [4, 2048, 1, 1024] - - [999, 285.039] + - [1073, 285.039] - - [2, 1024, 1, 500] - - [948, 109.502] + - [1022, 109.502] - - [64, 100, 1, 500] - - [948, 296.396] + - [1022, 296.396] - - [256, 2000, 1, 2000] - - [1056, 8152.97] + - [1130, 8152.97] - - [2, 512, 1, 500] - - [954, 44.8552] + - [1028, 44.8552] - - [8, 2048, 1, 500] - - [948, 736.791] + - [1022, 736.791] - - [10, 1024, 1, 500] - - [948, 547.109] + - [1022, 547.109] - - [4, 2048, 1, 2000] - - [958, 604.23] + - [1032, 604.23] - - [200, 1024, 1, 2000] - - [1125, 5400.94] + - [1199, 5400.94] - - [128, 500, 1, 512] - - [1118, 2730.77] + - [1192, 2730.77] - - [10, 500, 1, 2048] - - [1012, 359.651] + - [1086, 359.651] - - [256, 2048, 1, 2000] - - [1056, 8375.31] + - [1130, 8375.31] - - [8, 2000, 1, 2000] - - [958, 1146.23] + - [1032, 1146.23] - - [100, 2048, 1, 512] - - [1081, 3936.2] + - [1155, 3936.2] - - [512, 500, 1, 2048] - - [1057, 6756.39] + - [1131, 6756.39] - - [200, 2048, 1, 100] - - [1048, 3180.22] + - [1122, 3180.22] - - [128, 512, 1, 512] - - [951, 2872.91] + - [1025, 2872.91] - - [200, 2000, 1, 2048] - - [1097, 4818.92] + - [1171, 4818.92] - - [4, 2000, 1, 1024] - - [995, 275.369] + - [1069, 275.369] - - [64, 512, 1, 10] - - [1110, 69.5237] + - [1184, 69.5237] - - [32, 500, 1, 2000] - - [977, 1246.21] + - [1051, 1246.21] - - [128, 2048, 1, 2000] - - [1090, 7233.65] + - [1164, 7233.65] - - [100, 100, 1, 2048] - - [948, 790.223] + - [1022, 790.223] - - [500, 2048, 1, 512] - - [1078, 7249.66] + - [1152, 7249.66] - - [200, 100, 1, 512] - - [954, 748.638] + - [1028, 748.638] - - [32, 2000, 1, 100] - - [949, 930.333] + - [1023, 930.333] - - [500, 512, 1, 2048] - - [1100, 6640.02] + - [1174, 6640.02] - - [500, 2000, 1, 500] - - [1080, 7078.24] + - [1154, 7078.24] - - [200, 100, 1, 2048] - - [958, 1387.63] + - [1032, 1387.63] - - [2, 2048, 1, 100] - - [992, 64.9101] + - [1066, 64.9101] - - [8, 100, 1, 10] - - [933, 1.85439] + - [1007, 1.85439] - - [200, 2048, 1, 2048] - - [1097, 5022.02] + - [1171, 5022.02] - - [200, 2048, 1, 500] - - [1048, 5355.75] + - [1122, 5355.75] - - [100, 100, 1, 500] - - [1118, 416.767] + - [1192, 416.767] - - [8, 2048, 1, 10] - - [989, 34.8119] + - [1063, 34.8119] - - [100, 500, 1, 10] - - [929, 93.3836] + - [1003, 93.3836] - - [200, 500, 1, 2000] - - [1121, 4152.92] + - [1195, 4152.92] - - [512, 2000, 1, 500] - - [1050, 7485.48] + - [1124, 7485.48] - - [10, 500, 1, 1024] - - [1016, 363.736] + - [1090, 363.736] - - [256, 100, 1, 10] - - [1065, 41.1256] + - [1139, 41.1256] - - [500, 512, 1, 1024] - - [1044, 6362.82] + - [1118, 6362.82] - - [200, 2048, 1, 2000] - - [1086, 6321.09] + - [1160, 6321.09] - - [100, 1024, 1, 100] - - [1085, 1306.22] + - [1159, 1306.22] - - [500, 1024, 1, 100] - - [1024, 3699.52] + - [1098, 3699.52] - - [10, 512, 1, 2048] - - [948, 361.18] + - [1022, 361.18] - - [2, 1024, 1, 512] - - [997, 105.803] + - [1071, 105.803] - - [4, 500, 1, 2048] - - [1020, 143.517] + - [1094, 143.517] - - [100, 512, 1, 100] - - [953, 744.286] + - [1027, 744.286] - - [16, 500, 1, 512] - - [948, 453.197] + - [1022, 453.197] - - [10, 1024, 1, 100] - - [946, 166.334] + - [1020, 166.334] - - [8, 1024, 1, 100] - - [994, 140.374] + - [1068, 140.374] - - [64, 2000, 1, 500] - - [1089, 3940.99] + - [1163, 3940.99] - - [64, 1024, 1, 2000] - - [954, 3531.13] + - [1028, 3531.13] - - [10, 100, 1, 512] - - [948, 61.6385] + - [1022, 61.6385] - - [4, 500, 1, 2000] - - [974, 173.11] + - [1048, 173.11] - - [512, 1024, 1, 10] - - [1025, 736.46] + - [1099, 736.46] - - [128, 2048, 1, 2048] - - [1088, 4596.6] + - [1162, 4596.6] - - [4, 100, 1, 100] - - [937, 7.24286] + - [1011, 7.24286] - - [32, 1024, 1, 512] - - [997, 1519.78] + - [1071, 1519.78] - - [8, 512, 1, 2000] - - [1022, 356.894] + - [1096, 356.894] - - [100, 100, 1, 512] - - [962, 426.767] + - [1036, 426.767] - - [2, 2048, 1, 2048] - - [971, 170.878] + - [1045, 170.878] - - [2, 512, 1, 2000] - - [974, 90.8801] + - [1048, 90.8801] - - [16, 500, 1, 10] - - [947, 18.2818] + - [1021, 18.2818] - - [10, 500, 1, 100] - - [947, 88.1282] + - [1021, 88.1282] - - [4, 100, 1, 500] - - [1012, 23.6849] + - [1086, 23.6849] - - [512, 1024, 1, 1024] - - [1064, 7431.87] + - [1138, 7431.87] - - [64, 500, 1, 100] - - [957, 506.429] + - [1031, 506.429] - - [128, 2000, 1, 10] - - [1115, 432.532] + - [1189, 432.532] - - [10, 2000, 1, 2048] - - [978, 806.399] + - [1052, 806.399] - - [2, 100, 1, 100] - - [935, 3.225] + - [1009, 3.225] - - [10, 512, 1, 2000] - - [967, 462.194] + - [1041, 462.194] - - [8, 500, 1, 500] - - [948, 231.581] + - [1022, 231.581] - - [4, 500, 1, 512] - - [948, 118.619] + - [1022, 118.619] - - [10, 500, 1, 10] - - [942, 11.0649] + - [1016, 11.0649] - - [64, 512, 1, 2000] - - [948, 2116.9] + - [1022, 2116.9] - - [500, 512, 1, 10] - - [1069, 395.162] + - [1143, 395.162] - - [200, 512, 1, 512] - - [1087, 3449.36] + - [1161, 3449.36] - - [512, 500, 1, 500] - - [1043, 5536.43] + - [1117, 5536.43] - - [32, 512, 1, 2000] - - [958, 1264.3] + - [1032, 1264.3] - - [128, 500, 1, 2048] - - [1014, 3006.34] + - [1088, 3006.34] - - [500, 2048, 1, 10] - - [1039, 1049.28] + - [1113, 1049.28] - - [512, 512, 1, 100] - - [1076, 2664.16] + - [1150, 2664.16] - - [200, 2000, 1, 512] - - [1084, 5192.8] + - [1158, 5192.8] - - [500, 500, 1, 512] - - [1040, 5673.86] + - [1114, 5673.86] - - [128, 2048, 1, 500] - - [1072, 5251.38] + - [1146, 5251.38] - - [4, 512, 1, 512] - - [948, 123.753] + - [1022, 123.753] - - [16, 2048, 1, 2000] - - [964, 2294.78] + - [1038, 2294.78] - - [16, 500, 1, 1024] - - [948, 562.737] + - [1022, 562.737] - - [256, 2000, 1, 500] - - [1078, 6639.1] + - [1152, 6639.1] - - [10, 1024, 1, 10] - - [928, 21.0836] + - [1002, 21.0836] - - [16, 500, 1, 500] - - [948, 446.529] + - [1022, 446.529] - - [10, 2048, 1, 512] - - [946, 784.962] + - [1020, 784.962] - - [200, 500, 1, 10] - - [921, 176.156] + - [995, 176.156] - - [256, 2048, 1, 512] - - [1075, 6540.93] + - [1149, 6540.93] - - [256, 2000, 1, 2048] - - [1052, 6670.43] + - [1126, 6670.43] - - [500, 2048, 1, 500] - - [1080, 7264.57] + - [1154, 7264.57] - - [500, 100, 1, 1024] - - [1102, 2700.52] + - [1176, 2700.52] - - [16, 100, 1, 512] - - [1012, 96.7038] + - [1086, 96.7038] - - [64, 512, 1, 2048] - - [1013, 1868.39] + - [1087, 1868.39] - - [32, 1024, 1, 10] - - [924, 69.5237] + - [998, 69.5237] - - [16, 2048, 1, 512] - - [997, 1226.5] + - [1071, 1226.5] - - [8, 1024, 1, 512] - - [997, 416.202] + - [1071, 416.202] - - [4, 1024, 1, 2048] - - [1019, 223.201] + - [1093, 223.201] - - [100, 2048, 1, 2000] - - [1092, 5614.14] + - [1166, 5614.14] - - [512, 512, 1, 2048] - - [1057, 6868.97] + - [1131, 6868.97] - - [256, 2000, 1, 1024] - - [1048, 5758.98] + - [1122, 5758.98] - - [64, 512, 1, 512] - - [1117, 1651.4] + - [1191, 1651.4] - - [200, 1024, 1, 10] - - [931, 341.433] + - [1005, 341.433] - - [128, 500, 1, 500] - - [960, 2580.75] + - [1034, 2580.75] - - [100, 512, 1, 1024] - - [951, 2041.72] + - [1025, 2041.72] - - [16, 1024, 1, 500] - - [948, 867.897] + - [1022, 867.897] - - [128, 100, 1, 2048] - - [1118, 1011.46] + - [1192, 1011.46] - - [100, 512, 1, 500] - - [951, 2051.38] + - [1025, 2051.38] - - [8, 1024, 1, 1024] - - [964, 424.625] + - [1038, 424.625] - - [2, 2000, 1, 10] - - [990, 8.57458] + - [1064, 8.57458] - - [4, 500, 1, 10] - - [987, 4.56429] + - [1061, 4.56429] - - [500, 2000, 1, 2048] - - [1064, 7444.12] + - [1138, 7444.12] - - [4, 2000, 1, 100] - - [1000, 128.305] + - [1074, 128.305] - - [512, 2000, 1, 2000] - - [1050, 8454.53] + - [1124, 8454.53] - - [128, 500, 1, 10] - - [1109, 117.747] + - [1183, 117.747] - - [32, 1024, 1, 100] - - [957, 512.1] + - [1031, 512.1] - - [8, 500, 1, 2048] - - [972, 286.935] + - [1046, 286.935] - - [16, 1024, 1, 1024] - - [936, 881.256] + - [1010, 881.256] - - [200, 100, 1, 10] - - [1108, 40.4226] + - [1182, 40.4226] - - [512, 100, 1, 500] - - [1102, 1987.68] + - [1176, 1987.68] - - [512, 2048, 1, 2048] - - [1059, 8063.65] + - [1133, 8063.65] - - [16, 2000, 1, 512] - - [958, 1204.81] + - [1032, 1204.81] - - [64, 2048, 1, 1024] - - [956, 2853.37] + - [1030, 2853.37] - - [32, 2048, 1, 10] - - [930, 130.132] + - [1004, 130.132] - - [10, 2048, 1, 10] - - [932, 39.4846] + - [1006, 39.4846] - - [4, 2000, 1, 512] - - [948, 316.149] + - [1022, 316.149] - - [4, 500, 1, 100] - - [947, 35.8143] + - [1021, 35.8143] - - [8, 100, 1, 2048] - - [967, 84.7281] + - [1041, 84.7281] - - [512, 2048, 1, 10] - - [1047, 1225.07] + - [1121, 1225.07] - - [512, 100, 1, 10] - - [1036, 90.2408] + - [1110, 90.2408] - - [4, 512, 1, 1024] - - [948, 143.348] + - [1022, 143.348] - - [16, 2048, 1, 10] - - [981, 65.1159] + - [1055, 65.1159] - - [500, 2000, 1, 100] - - [1032, 4717.08] + - [1106, 4717.08] - - [32, 1024, 1, 2048] - - [975, 1582.86] + - [1049, 1582.86] - - [100, 2000, 1, 2000] - - [1092, 5512.78] + - [1166, 5512.78] - - [128, 100, 1, 512] - - [1118, 561.196] + - [1192, 561.196] - - [500, 500, 1, 100] - - [1072, 2460.73] + - [1146, 2460.73] - - [32, 2000, 1, 10] - - [924, 119.503] + - [998, 119.503] - - [128, 2048, 1, 100] - - [1072, 2708.2] + - [1146, 2708.2] - - [10, 2000, 1, 100] - - [947, 316.556] + - [1021, 316.556] - - [2, 2048, 1, 500] - - [958, 191.145] + - [1032, 191.145] - - [32, 1024, 1, 500] - - [958, 1563.46] + - [1032, 1563.46] - - [4, 1024, 1, 10] - - [987, 9.24286] + - [1061, 9.24286] - - [100, 512, 1, 10] - - [1113, 97.0697] + - [1187, 97.0697] - - [8, 100, 1, 100] - - [963, 14.3857] + - [1037, 14.3857] - - [128, 512, 1, 500] - - [951, 2677.22] + - [1025, 2677.22] - - [16, 100, 1, 2048] - - [974, 161.997] + - [1048, 161.997] - - [2, 1024, 1, 10] - - [987, 4.59123] + - [1061, 4.59123] - - [4, 100, 1, 2048] - - [967, 41.8959] + - [1041, 41.8959] - - [4, 512, 1, 2000] - - [967, 180.382] + - [1041, 180.382] - - [4096, 64, 1, 2048] - - [1167, 7247.28] + - [1241, 7247.28] - - [1024, 10080, 1, 1024] - - [1155, 9833.47] + - [1229, 9833.47] - - [1024, 1131, 1, 1024] - - [1133, 7551.95] + - [1207, 7551.95] - - [36548, 1216, 1, 1024] - - [1145, 10351.6] + - [1219, 10351.6] - - [1024, 29, 1, 1024] - - [1177, 1697.01] + - [1251, 1697.01] - - [1024, 2592, 1, 1024] - - [1146, 8424.11] + - [1220, 8424.11] - - [1024, 1568, 1, 1024] - - [1157, 7511.86] + - [1231, 7511.86] - - [4096, 91, 1, 2048] - - [1126, 5599.91] + - [1200, 5599.91] - - [1024, 4445, 1, 1024] - - [1144, 9261.22] + - [1218, 9261.22] - - [1024, 6272, 1, 1024] - - [1139, 9439.61] + - [1213, 9439.61] - - [36548, 3584, 1, 1024] - - [1138, 10393.8] + - [1212, 10393.8] - - [1024, 1827, 1, 1024] - - [1157, 8714.42] + - [1231, 8714.42] - - [1024, 3220, 1, 1024] - - [1137, 8861.2] + - [1211, 8861.2] - - [1024, 1856, 1, 1024] - - [1154, 8827.05] + - [1228, 8827.05] - - [1024, 1760, 1, 1024] - - [1154, 8334.2] + - [1228, 8334.2] - - [1024, 1600, 1, 1024] - - [1154, 7615.07] + - [1228, 7615.07] - - [1024, 1, 1, 21] - - [1158, 0.1] + - [1232, 0.1] - - [36548, 4235, 1, 1024] - - [1138, 10276.8] + - [1212, 10276.8] - - [1024, 49, 1, 1024] - - [1173, 2643.12] + - [1247, 2643.12] - - [1024, 1984, 1, 1024] - - [1157, 9449.52] + - [1231, 9449.52] - - [1024, 14720, 1, 1024] - - [1144, 10033.3] + - [1218, 10033.3] - - [1024, 1152, 1, 1024] - - [1127, 7523.54] + - [1201, 7523.54] - - [36548, 14976, 1, 1024] - - [1145, 10421.7] + - [1219, 10421.7] - - [36548, 1152, 1, 1024] - - [1145, 10258.1] + - [1219, 10258.1] - - [4096, 86, 1, 3072] - - [1126, 5308.85] + - [1200, 5308.85] - - [1024, 3392, 1, 1024] - - [1139, 9176.54] + - [1213, 9176.54] - - [1024, 1408, 1, 1024] - - [1139, 8958.83] + - [1213, 8958.83] - - [1024, 2080, 1, 1024] - - [1130, 8396.49] + - [1204, 8396.49] - - [1024, 1824, 1, 1024] - - [1148, 8671.71] + - [1222, 8671.71] - - [36548, 2432, 1, 1024] - - [1138, 10392.6] + - [1212, 10392.6] - - [4096, 29, 1, 2048] - - [1159, 4325.66] + - [1233, 4325.66] - - [1024, 1102, 1, 1024] - - [1133, 7204.18] + - [1207, 7204.18] - - [4096, 49, 1, 2048] - - [1165, 5609.29] + - [1239, 5609.29] - - [36548, 1827, 1, 1024] - - [1145, 10183.2] + - [1219, 10183.2] - - [4096, 25, 1, 2048] - - [1160, 3788.31] + - [1234, 3788.31] - - [1024, 10176, 1, 1024] - - [1155, 9941.18] + - [1229, 9941.18] - - [1024, 774, 1, 1024] - - [1140, 7079.67] + - [1214, 7079.67] - - [1024, 1952, 1, 1024] - - [1157, 9300.49] + - [1231, 9300.49] - - [4096, 128, 1, 2048] - - [1127, 8274.96] + - [1201, 8274.96] - - [1024, 17024, 1, 1024] - - [1137, 9960.72] + - [1211, 9960.72] - - [1024, 1472, 1, 1024] - - [1146, 9343.37] + - [1220, 9343.37] - - [36548, 4459, 1, 1024] - - [1138, 10358.1] + - [1212, 10358.1] - - [4096, 91, 1, 3072] - - [1132, 5509.39] + - [1206, 5509.39] - - [1024, 3712, 1, 1024] - - [1146, 9048.66] + - [1220, 9048.66] - - [4096, 64, 1, 3072] - - [1179, 7489.93] + - [1253, 7489.93] - - [4096, 29, 1, 3072] - - [1159, 4511.78] + - [1233, 4511.78] - - [4096, 128, 1, 3072] - - [1126, 8423.83] + - [1200, 8423.83] - - [36548, 12928, 1, 1024] - - [1145, 10426.1] + - [1219, 10426.1] - - [1024, 1632, 1, 1024] - - [1127, 7761.73] + - [1201, 7761.73] - - [1024, 1696, 1, 1024] - - [1152, 8107.29] + - [1226, 8107.29] - - [4096, 24, 1, 2048] - - [1159, 3663.25] + - [1233, 3663.25] - - [4096, 63, 1, 3072] - - [1168, 7175.37] + - [1242, 7175.37] - - [4096, 96, 1, 2048] - - [1127, 5866.28] + - [1201, 5866.28] - - [36548, 1764, 1, 1024] - - [1138, 10128.5] + - [1212, 10128.5] - - [4096, 32, 1, 2048] - - [1163, 4540.62] + - [1237, 4540.62] - - [1024, 35, 1, 1024] - - [1171, 1911.57] + - [1245, 1911.57] - - [1024, 1120, 1, 1024] - - [1126, 7289.13] + - [1200, 7289.13] - - [4096, 49, 1, 3072] - - [1165, 5751.62] + - [1239, 5751.62] - - [1024, 24, 1, 1024] - - [1171, 1392.02] + - [1245, 1392.02] - - [1024, 2944, 1, 1024] - - [1147, 9284.93] + - [1221, 9284.93] - - [36548, 14080, 1, 1024] - - [1138, 10441.4] + - [1212, 10441.4] - - [1024, 1, 1, 1024] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 1280, 1, 1024] - - [1126, 8244.46] + - [1200, 8244.46] - - [1024, 13440, 1, 1024] - - [1138, 9799.92] + - [1212, 9799.92] - - [1024, 1015, 1, 1024] - - [1146, 9187.85] + - [1220, 9187.85] - - [36548, 9120, 1, 1024] - - [1138, 10400.0] + - [1212, 10400.0] - - [36548, 1, 1, 1024] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 3008, 1, 1024] - - [1147, 9468.55] + - [1221, 9468.55] - - [1024, 2560, 1, 1024] - - [1144, 8879.31] + - [1218, 8879.31] - - [1024, 21, 1, 1024] - - [1170, 1234.41] + - [1244, 1234.41] - - [1024, 2208, 1, 1024] - - [1126, 8231.27] + - [1200, 8231.27] - - [1024, 96, 1, 1024] - - [1176, 3767.44] + - [1250, 3767.44] - - [4096, 86, 1, 2048] - - [1127, 5529.09] + - [1201, 5529.09] - - [4096, 96, 1, 3072] - - [1126, 6273.28] + - [1200, 6273.28] - - [1024, 1920, 1, 1024] - - [1156, 9118.19] + - [1230, 9118.19] - - [4096, 27, 1, 2048] - - [1159, 4073.7] + - [1233, 4073.7] - - [36548, 2496, 1, 1024] - - [1138, 10361.2] + - [1212, 10361.2] - - [1024, 1, 1, 14] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 91, 1, 1024] - - [1178, 3647.67] + - [1252, 3647.67] - - [1024, 2016, 1, 1024] - - [1154, 9560.24] + - [1228, 9560.24] - - [1024, 1184, 1, 1024] - - [1127, 7678.96] + - [1201, 7678.96] - - [4096, 1, 1, 2048] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 1664, 1, 1024] - - [1152, 7934.07] + - [1226, 7934.07] - - [1024, 11424, 1, 1024] - - [1144, 9777.91] + - [1218, 9777.91] - - [4096, 24, 1, 3072] - - [1162, 3813.1] + - [1236, 3813.1] - - [1024, 1216, 1, 1024] - - [1126, 7902.13] + - [1200, 7902.13] - - [36548, 3185, 1, 1024] - - [1138, 10336.7] + - [1212, 10336.7] - - [36548, 9216, 1, 1024] - - [1138, 10414.3] + - [1212, 10414.3] - - [1024, 3200, 1, 1024] - - [1144, 8847.01] + - [1218, 8847.01] - - [1024, 2656, 1, 1024] - - [1139, 8649.25] + - [1213, 8649.25] - - [1024, 2368, 1, 1024] - - [1139, 8873.16] + - [1213, 8873.16] - - [1024, 4459, 1, 1024] - - [1146, 9431.32] + - [1220, 9431.32] - - [1024, 3808, 1, 1024] - - [1146, 9263.72] + - [1220, 9263.72] - - [1024, 2336, 1, 1024] - - [1139, 8966.0] + - [1213, 8966.0] - - [4096, 27, 1, 3072] - - [1159, 4171.74] + - [1233, 4171.74] - - [1024, 2304, 1, 1024] - - [1136, 8601.38] + - [1210, 8601.38] - - [1024, 1560, 1, 1024] - - [1151, 7481.74] + - [1225, 7481.74] - - [4096, 35, 1, 3072] - - [1165, 4176.9] + - [1239, 4176.9] - - [1024, 2496, 1, 1024] - - [1142, 9092.86] + - [1216, 9092.86] - - [1024, 1504, 1, 1024] - - [1142, 9220.53] + - [1216, 9220.53] - - [4096, 50, 1, 2048] - - [1166, 5472.83] + - [1240, 5472.83] - - [1024, 3232, 1, 1024] - - [1139, 8961.94] + - [1213, 8961.94] - - [1024, 14, 1, 1024] - - [1170, 882.315] + - [1244, 882.315] - - [36548, 1015, 1, 1024] - - [1138, 10140.9] + - [1212, 10140.9] - - [1024, 2000, 1, 1024] - - [1150, 9487.8] + - [1224, 9487.8] - - [36548, 243, 1, 1024] - - [1143, 9441.12] + - [1217, 9441.12] - - [36548, 32, 1, 1024] - - [1131, 4721.05] + - [1205, 4721.05] - - [1024, 25, 1, 1024] - - [1177, 1462.96] + - [1251, 1462.96] - - [1024, 13184, 1, 1024] - - [1141, 9866.28] + - [1215, 9866.28] - - [1024, 2688, 1, 1024] - - [1136, 8559.93] + - [1210, 8559.93] - - [1024, 27, 1, 1024] - - [1175, 1559.11] + - [1249, 1559.11] - - [36548, 950, 1, 1024] - - [1145, 10053.6] + - [1219, 10053.6] - - [1024, 1764, 1, 1024] - - [1152, 8347.11] + - [1226, 8347.11] - - [1024, 992, 1, 1024] - - [1139, 9035.82] + - [1213, 9035.82] - - [1024, 1376, 1, 1024] - - [1139, 8797.96] + - [1213, 8797.96] - - [1024, 950, 1, 1024] - - [1146, 8635.26] + - [1220, 8635.26] - - [36548, 774, 1, 1024] - - [1138, 9460.82] + - [1212, 9460.82] - - [36548, 25, 1, 1024] - - [1131, 3694.16] + - [1205, 3694.16] - - [1024, 4256, 1, 1024] - - [1139, 9172.16] + - [1213, 9172.16] - - [4096, 32, 1, 3072] - - [1160, 4886.67] + - [1234, 4886.67] - - [1024, 243, 1, 1024] - - [1164, 6594.41] + - [1238, 6594.41] - - [36548, 3712, 1, 1024] - - [1138, 10401.6] + - [1212, 10401.6] - - [1024, 50, 1, 1024] - - [1173, 2742.19] + - [1247, 2742.19] - - [1024, 3360, 1, 1024] - - [1135, 9017.37] + - [1209, 9017.37] - - [1024, 2048, 1, 1024] - - [1150, 9736.65] + - [1224, 9736.65] - - [1024, 2784, 1, 1024] - - [1146, 8835.6] + - [1220, 8835.6] - - [1024, 4992, 1, 1024] - - [1144, 9639.38] + - [1218, 9639.38] - - [36548, 1102, 1, 1024] - - [1145, 9859.04] + - [1219, 9859.04] - - [1024, 1536, 1, 1024] - - [1137, 9294.98] + - [1211, 9294.98] - - [1024, 2720, 1, 1024] - - [1142, 8617.88] + - [1216, 8617.88] - - [4096, 1, 1, 3072] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 2752, 1, 1024] - - [1146, 8902.17] + - [1220, 8902.17] - - [1024, 2816, 1, 1024] - - [1144, 8906.95] + - [1218, 8906.95] - - [1024, 2624, 1, 1024] - - [1146, 8494.41] + - [1220, 8494.41] - - [1024, 2144, 1, 1024] - - [1129, 8243.56] + - [1203, 8243.56] - - [36548, 1131, 1, 1024] - - [1145, 10104.6] + - [1219, 10104.6] - - [4096, 25, 1, 3072] - - [1160, 3959.98] + - [1234, 3959.98] - - [1024, 64, 1, 1024] - - [1173, 3410.1] + - [1247, 3410.1] - - [1024, 3296, 1, 1024] - - [1144, 9066.52] + - [1218, 9066.52] - - [36548, 4992, 1, 1024] - - [1138, 10395.6] + - [1212, 10395.6] - - [1024, 1344, 1, 1024] - - [1139, 8522.66] + - [1213, 8522.66] - - [36548, 2401, 1, 1024] - - [1138, 10250.3] + - [1212, 10250.3] - - [1024, 15744, 1, 1024] - - [1138, 10006.4] + - [1212, 10006.4] - - [1024, 15232, 1, 1024] - - [1137, 9912.21] + - [1211, 9912.21] - - [1024, 1888, 1, 1024] - - [1149, 8962.98] + - [1223, 8962.98] - - [1024, 1792, 1, 1024] - - [1153, 8556.82] + - [1227, 8556.82] - - [36548, 1073, 1, 1024] - - [1138, 10161.2] + - [1212, 10161.2] - - [4096, 50, 1, 3072] - - [1165, 5882.16] + - [1239, 5882.16] - - [36548, 15488, 1, 1024] - - [1145, 10437.1] + - [1219, 10437.1] - - [1024, 2464, 1, 1024] - - [1142, 8880.02] + - [1216, 8880.02] - - [1024, 2272, 1, 1024] - - [1139, 8720.35] + - [1213, 8720.35] - - [1024, 13, 1, 1024] - - [1169, 774.616] + - [1243, 774.616] - - [1024, 2432, 1, 1024] - - [1144, 8491.53] + - [1218, 8491.53] - - [36548, 24, 1, 1024] - - [1131, 3564.41] + - [1205, 3564.41] - - [1024, 3936, 1, 1024] - - [1154, 9433.3] + - [1228, 9433.3] - - [36548, 13824, 1, 1024] - - [1138, 10439.8] + - [1212, 10439.8] - - [1024, 2401, 1, 1024] - - [1146, 8870.03] + - [1220, 8870.03] - - [1024, 32, 1, 1024] - - [1161, 1839.71] + - [1235, 1839.71] - - [1024, 2176, 1, 1024] - - [1130, 8544.55] + - [1204, 8544.55] - - [1024, 2240, 1, 1024] - - [1139, 8381.55] + - [1213, 8381.55] - - [1024, 1728, 1, 1024] - - [1127, 8212.33] + - [1201, 8212.33] - - [1024, 128, 1, 1024] - - [1174, 4660.44] + - [1248, 4660.44] - - [1024, 216, 1, 1024] - - [1164, 5777.97] + - [1238, 5777.97] - - [1024, 63, 1, 1024] - - [1172, 3329.75] + - [1246, 3329.75] - - [1024, 86, 1, 1024] - - [1178, 3533.7] + - [1252, 3533.7] - - [1024, 2528, 1, 1024] - - [1134, 8789.25] + - [1208, 8789.25] - - [1024, 2400, 1, 1024] - - [1139, 8939.4] + - [1213, 8939.4] - - [1024, 1440, 1, 1024] - - [1146, 9131.41] + - [1220, 9131.41] - - [1024, 2912, 1, 1024] - - [1139, 9140.03] + - [1213, 9140.03] - - [4096, 35, 1, 2048] - - [1165, 4059.85] + - [1239, 4059.85] - - [4096, 63, 1, 2048] - - [1167, 6946.5] + - [1241, 6946.5] - - [1024, 2880, 1, 1024] - - [1137, 9104.98] + - [1211, 9104.98] - - [1024, 4064, 1, 1024] - - [1156, 9715.2] + - [1230, 9715.2] - - [1024, 4655, 1, 1024] - - [1144, 9033.9] + - [1218, 9033.9] - - [1024, 1088, 1, 1024] - - [1128, 8144.41] + - [1202, 8144.41] - - [36548, 6272, 1, 1024] - - [1145, 10427.4] + - [1219, 10427.4] - - [1024, 1, 1, 13] - - [1158, 0.1] + - [1232, 0.1] - - [768, 512, 1, 768] - - [1182, 5889.14] + - [1256, 5889.14] - - [768, 2048, 1, 3072] - - [1192, 9394.72] + - [1266, 9394.72] - - [768, 32, 1, 768] - - [1204, 1502.84] + - [1278, 1502.84] - - [64, 128, 96, 128] - - [1199, 4973.58] + - [1273, 4973.58] - - [3072, 1024, 1, 768] - - [1193, 9856.17] + - [1267, 9856.17] - - [768, 1024, 1, 3072] - - [1186, 8611.16] + - [1260, 8611.16] - - [768, 512, 1, 3072] - - [1185, 6430.89] + - [1259, 6430.89] - - [768, 64, 1, 768] - - [1206, 2621.54] + - [1280, 2621.54] - - [768, 4096, 1, 3072] - - [1191, 10030.5] + - [1265, 10030.5] - - [768, 2048, 1, 2] - - [1184, 381.863] + - [1258, 381.863] - - [768, 2048, 1, 768] - - [1189, 9754.3] + - [1263, 9754.3] - - [768, 320, 1, 30522] - - [1202, 8529.5] + - [1276, 8529.5] - - [64, 64, 96, 64] - - [1196, 2496.71] + - [1270, 2496.71] - - [768, 640, 1, 30522] - - [1183, 8253.94] + - [1257, 8253.94] - - [768, 1280, 1, 30522] - - [1188, 9572.95] + - [1262, 9572.95] - - [768, 1280, 1, 768] - - [1192, 8714.03] + - [1266, 8714.03] - - [768, 640, 1, 768] - - [1182, 7293.13] + - [1256, 7293.13] - - [768, 32, 1, 2] - - [1194, 11.9154] + - [1268, 11.9154] - - [3072, 2048, 1, 768] - - [1189, 10019.7] + - [1263, 10019.7] - - [768, 4096, 1, 768] - - [1189, 9927.45] + - [1263, 9927.45] - - [3072, 4096, 1, 768] - - [1192, 10150.2] + - [1266, 10150.2] - - [64, 256, 192, 256] - - [1198, 7054.29] + - [1272, 7054.29] - - [768, 8, 1, 768] - - [1205, 341.039] + - [1279, 341.039] - - [64, 128, 384, 128] - - [1197, 6765.11] + - [1271, 6765.11] - - [768, 1024, 1, 768] - - [1187, 8768.68] + - [1261, 8768.68] - - [768, 320, 1, 768] - - [1203, 6838.64] + - [1277, 6838.64] - - [64, 64, 768, 64] - - [1200, 5388.93] + - [1274, 5388.93] - - [768, 1024, 1, 2] - - [1180, 258.795] + - [1254, 258.795] - - [768, 16, 1, 768] - - [1205, 819.3] + - [1279, 819.3] - - [64, 256, 96, 256] - - [1198, 5893.74] + - [1272, 5893.74] - - [3072, 512, 1, 768] - - [1190, 9722.89] + - [1264, 9722.89] - - [768, 160, 1, 768] - - [1207, 5019.88] + - [1281, 5019.88] - - [768, 4096, 1, 2] - - [1181, 507.475] + - [1255, 507.475] - - [1600, 512, 1, 1024] - - [1211, 7187.05] + - [1285, 7187.05] - - [1024, 512, 1, 64] - - [1209, 2557.6] + - [1283, 2557.6] - - [1024, 512, 1, 1] - - [1208, 71.3348] + - [1282, 71.3348] - - [2048, 512, 1, 1] - - [1210, 90.4945] + - [1284, 90.4945] - - [1024, 200, 1, 1] - - [1216, 40.1] + - [1290, 40.1] - - [32, 200, 1, 1] - - [1212, 1.66863] + - [1286, 1.66863] - - [560, 200, 1, 1024] - - [1220, 4731.45] + - [1294, 4731.45] - - [1, 512, 1, 1] - - [1219, 0.230612] + - [1293, 0.230612] - - [64, 512, 1, 1] - - [1214, 7.68519] + - [1288, 7.68519] - - [1024, 8192, 1, 256] - - [1229, 9519.09] + - [1303, 9519.09] - - [1024, 22016, 1, 256] - - [1235, 9881.22] + - [1309, 9881.22] - - [256, 8976, 1, 4352] - - [1227, 9567.18] + - [1301, 9567.18] - - [512, 256, 1, 2048] - - [1240, 5917.99] + - [1314, 5917.99] - - [1024, 19968, 1, 256] - - [1235, 9882.47] + - [1309, 9882.47] - - [256, 8976, 1, 1536] - - [1225, 8437.45] + - [1299, 8437.45] - - [256, 8976, 1, 33536] - - [1225, 8441.99] + - [1299, 8441.99] - - [1024, 1792, 1, 256] - - [1225, 7757.07] + - [1299, 7757.07] - - [1024, 21504, 1, 256] - - [1235, 9894.0] + - [1309, 9894.0] - - [512, 215, 1, 2048] - - [1241, 4665.74] + - [1315, 4665.74] - - [1024, 7168, 1, 256] - - [1229, 9509.45] + - [1303, 9509.45] - - [256, 8976, 1, 15872] - - [1231, 8914.75] + - [1305, 8914.75] - - [1024, 19712, 1, 256] - - [1235, 9772.0] + - [1309, 9772.0] - - [256, 8976, 1, 5632] - - [1231, 8740.13] + - [1305, 8740.13] - - [1024, 14848, 1, 256] - - [1235, 9756.25] + - [1309, 9756.25] - - [1024, 28672, 1, 256] - - [1235, 9959.02] + - [1309, 9959.02] - - [256, 8976, 1, 9728] - - [1238, 8853.14] + - [1312, 8853.14] - - [1024, 17152, 1, 256] - - [1229, 9737.4] + - [1303, 9737.4] - - [256, 8976, 1, 11520] - - [1231, 8999.3] + - [1305, 8999.3] - - [256, 8976, 1, 8192] - - [1221, 7897.42] + - [1295, 7897.42] - - [1024, 3328, 1, 256] - - [1236, 8593.63] + - [1310, 8593.63] - - [256, 8976, 1, 7424] - - [1231, 8980.57] + - [1305, 8980.57] - - [1024, 18944, 1, 256] - - [1235, 9854.95] + - [1309, 9854.95] - - [1024, 10496, 1, 256] - - [1230, 9454.0] + - [1304, 9454.0] - - [256, 8976, 1, 5376] - - [1228, 9608.47] + - [1302, 9608.47] - - [256, 8976, 1, 6144] - - [1225, 7880.23] + - [1299, 7880.23] - - [1024, 40448, 1, 256] - - [1235, 10016.7] + - [1309, 10016.7] - - [256, 8976, 1, 22016] - - [1238, 8939.97] + - [1312, 8939.97] - - [256, 8976, 1, 4864] - - [1226, 9211.53] + - [1300, 9211.53] - - [256, 8976, 1, 12288] - - [1222, 8065.15] + - [1296, 8065.15] - - [1024, 9728, 1, 256] - - [1235, 9636.35] + - [1309, 9636.35] - - [256, 8976, 1, 2048] - - [1223, 7001.43] + - [1297, 7001.43] - - [1024, 10240, 1, 256] - - [1229, 9620.06] + - [1303, 9620.06] - - [256, 8976, 1, 2304] - - [1227, 9509.84] + - [1301, 9509.84] - - [1024, 7936, 1, 256] - - [1235, 9300.77] + - [1309, 9300.77] - - [768, 256, 1, 2048] - - [1239, 6268.05] + - [1313, 6268.05] - - [1024, 9984, 1, 256] - - [1235, 9477.38] + - [1309, 9477.38] - - [1024, 13312, 1, 256] - - [1235, 9758.66] + - [1309, 9758.66] - - [1024, 16128, 1, 256] - - [1229, 9722.0] + - [1303, 9722.0] - - [1024, 8960, 1, 256] - - [1230, 9398.35] + - [1304, 9398.35] - - [1024, 5120, 1, 256] - - [1236, 9315.6] + - [1310, 9315.6] - - [1024, 11264, 1, 256] - - [1229, 9664.9] + - [1303, 9664.9] - - [256, 8976, 1, 20480] - - [1237, 8279.97] + - [1311, 8279.97] - - [1024, 20992, 1, 256] - - [1229, 9878.97] + - [1303, 9878.97] - - [256, 8976, 1, 9472] - - [1231, 8991.06] + - [1305, 8991.06] - - [256, 8976, 1, 8448] - - [1231, 8983.62] + - [1305, 8983.62] - - [256, 8976, 1, 20992] - - [1232, 8942.21] + - [1306, 8942.21] - - [256, 8976, 1, 10496] - - [1232, 8989.81] + - [1306, 8989.81] - - [1024, 15104, 1, 256] - - [1230, 9676.11] + - [1304, 9676.11] - - [1024, 6400, 1, 256] - - [1238, 9145.99] + - [1312, 9145.99] - - [1024, 4096, 1, 256] - - [1231, 9124.35] + - [1305, 9124.35] - - [256, 8976, 1, 2560] - - [1225, 8566.21] + - [1299, 8566.21] - - [256, 8976, 1, 2816] - - [1227, 9496.94] + - [1301, 9496.94] - - [1024, 7680, 1, 256] - - [1235, 9460.94] + - [1309, 9460.94] - - [256, 8976, 1, 14336] - - [1232, 8226.9] + - [1306, 8226.9] - - [256, 8976, 1, 6656] - - [1232, 8771.52] + - [1306, 8771.52] - - [1024, 3072, 1, 256] - - [1232, 9077.04] + - [1306, 9077.04] - - [256, 8976, 1, 5888] - - [1228, 9546.4] + - [1302, 9546.4] - - [1024, 12288, 1, 256] - - [1229, 9690.91] + - [1303, 9690.91] - - [256, 8976, 1, 26112] - - [1234, 8699.93] + - [1308, 8699.93] - - [1024, 7424, 1, 256] - - [1236, 9256.94] + - [1310, 9256.94] - - [256, 8976, 1, 14848] - - [1237, 8885.89] + - [1311, 8885.89] - - [768, 215, 1, 2048] - - [1239, 5628.69] + - [1313, 5628.69] - - [1024, 2560, 1, 256] - - [1232, 8820.93] + - [1306, 8820.93] - - [256, 8976, 1, 19968] - - [1231, 8928.96] + - [1305, 8928.96] - - [256, 8976, 1, 9984] - - [1231, 8993.22] + - [1305, 8993.22] - - [1024, 4864, 1, 256] - - [1232, 8974.4] + - [1306, 8974.4] - - [1024, 33536, 1, 256] - - [1235, 9943.17] + - [1309, 9943.17] - - [256, 8976, 1, 15104] - - [1232, 8996.73] + - [1306, 8996.73] - - [1024, 2048, 1, 256] - - [1230, 8462.76] + - [1304, 8462.76] - - [256, 8976, 1, 8960] - - [1232, 8999.02] + - [1306, 8999.02] - - [1024, 6144, 1, 256] - - [1237, 9359.77] + - [1311, 9359.77] - - [1024, 14592, 1, 256] - - [1235, 9667.52] + - [1309, 9667.52] - - [256, 8976, 1, 19712] - - [1231, 9020.21] + - [1305, 9020.21] - - [1024, 11520, 1, 256] - - [1230, 9527.8] + - [1304, 9527.8] - - [1024, 5632, 1, 256] - - [1229, 9297.3] + - [1303, 9297.3] - - [256, 8976, 1, 11008] - - [1238, 8994.9] + - [1312, 8994.9] - - [256, 8976, 1, 17152] - - [1232, 9003.9] + - [1306, 9003.9] - - [256, 8976, 1, 3072] - - [1221, 8262.06] + - [1295, 8262.06] - - [1024, 3840, 1, 256] - - [1238, 8671.99] + - [1312, 8671.99] - - [1024, 14336, 1, 256] - - [1235, 9760.38] + - [1309, 9760.38] - - [1024, 20480, 1, 256] - - [1229, 9887.95] + - [1303, 9887.95] - - [1024, 23552, 1, 256] - - [1229, 9890.56] + - [1303, 9890.56] - - [256, 8976, 1, 7168] - - [1224, 8478.44] + - [1298, 8478.44] - - [1024, 13568, 1, 256] - - [1229, 9654.74] + - [1303, 9654.74] - - [1024, 4608, 1, 256] - - [1237, 9218.35] + - [1311, 9218.35] - - [256, 8976, 1, 10240] - - [1222, 8076.26] + - [1296, 8076.26] - - [1024, 8704, 1, 256] - - [1231, 9475.6] + - [1305, 9475.6] - - [1024, 11008, 1, 256] - - [1235, 9525.06] + - [1309, 9525.06] - - [1024, 8448, 1, 256] - - [1229, 9352.26] + - [1303, 9352.26] - - [256, 8976, 1, 44505] - - [1233, 8430.33] + - [1307, 8430.33] + - - [6272, 256, 1, 528] + - [1359, 7390.04] + - - [3136, 2048, 1, 1024] + - [1340, 9658.04] + - - [6272, 112, 1, 512] + - [1338, 5931.19] + - - [2048, 320, 1, 1280] + - [1358, 7773.09] + - - [289, 256, 1, 1568] + - [1379, 3718.27] + - - [3136, 64, 64, 64] + - [1318, 8201.25] + - - [50176, 128, 1, 256] + - [1341, 8908.68] + - - [5329, 64, 1, 448] + - [1324, 4602.3] + - - [289, 192, 1, 1344] + - [1376, 3452.69] + - - [12544, 1024, 1, 256] + - [1341, 9742.74] + - - [784, 64, 32, 192] + - [1317, 6844.71] + - - [6272, 64, 1, 480] + - [1325, 5562.34] + - - [196, 128, 1, 800] + - [1367, 1639.84] + - - [64, 512, 1, 1344] + - [1366, 2313.14] + - - [6272, 64, 1, 512] + - [1324, 5609.29] + - - [6272, 160, 1, 528] + - [1325, 6149.8] + - - [289, 160, 32, 768] + - [1352, 6637.92] + - - [12544, 256, 1, 1024] + - [1359, 8790.56] + - - [289, 224, 1, 1568] + - [1379, 3270.27] + - - [5329, 64, 32, 160] + - [1332, 9091.14] + - - [5329, 96, 1, 576] + - [1359, 5555.76] + - - [3025, 64, 1, 363] + - [1377, 4392.4] + - - [784, 32, 32, 192] + - [1348, 5633.9] + - - [3136, 512, 1, 1024] + - [1344, 7553.24] + - - [6272, 16, 1, 480] + - [1379, 3219.95] + - - [1225, 64, 32, 288] + - [1339, 8240.68] + - - [64, 256, 1, 1536] + - [1372, 1456.46] + - - [289, 192, 32, 768] + - [1351, 7372.9] + - - [2048, 448, 1, 1280] + - [1334, 8403.11] + - - [3136, 2048, 1, 512] + - [1333, 9486.41] + - - [289, 256, 1, 2016] + - [1379, 3876.18] + - - [289, 384, 32, 1024] + - [1318, 7350.64] + - - [1568, 32, 1, 832] + - [1368, 2717.97] + - - [3136, 64, 32, 64] + - [1321, 7657.36] + - - [289, 160, 1, 1120] + - [1375, 2827.0] + - - [6272, 128, 1, 528] + - [1329, 6926.36] + - - [21609, 32, 1, 288] + - [1330, 3699.0] + - - [1225, 192, 1, 1728] + - [1363, 7309.91] + - - [4096, 512, 1, 4096] + - [1346, 10272.2] + - - [64, 256, 1, 1152] + - [1372, 1387.92] + - - [6272, 96, 1, 480] + - [1360, 6371.66] + - - [784, 96, 1, 800] + - [1380, 3330.37] + - - [2048, 448, 1, 2048] + - [1334, 8622.75] + - - [784, 96, 32, 192] + - [1349, 7092.46] + - - [3136, 64, 64, 256] + - [1342, 9579.26] + - - [289, 224, 1, 1344] + - [1379, 3180.11] + - - [1001, 512, 1, 4096] + - [1320, 8195.17] + - - [2048, 192, 1, 1280] + - [1325, 6120.19] + - - [1225, 64, 32, 256] + - [1330, 8076.72] + - - [2048, 256, 1, 1536] + - [1320, 8137.8] + - - [1225, 64, 1, 1200] + - [1379, 3552.97] + - - [6272, 128, 1, 512] + - [1333, 6878.31] + - - [729, 192, 1, 1600] + - [1378, 5016.87] + - - [289, 192, 1, 896] + - [1376, 3091.97] + - - [1568, 384, 1, 832] + - [1359, 6934.72] + - - [784, 16, 32, 192] + - [1350, 3380.38] + - - [1568, 256, 1, 832] + - [1324, 5980.96] + - - [1568, 48, 1, 832] + - [1381, 3275.19] + - - [1568, 192, 1, 832] + - [1319, 4441.21] + - - [289, 192, 32, 1024] + - [1322, 6563.16] + - - [6272, 32, 1, 528] + - [1363, 4998.77] + - - [49, 128, 1, 1200] + - [1364, 550.275] + - - [1225, 64, 32, 384] + - [1336, 8589.43] + - - [289, 128, 1, 896] + - [1375, 2103.2] + - - [1568, 160, 1, 832] + - [1363, 6995.15] + - - [1001, 32, 1, 1024] + - [1372, 1744.82] + - - [2048, 320, 1, 2048] + - [1357, 7118.14] + - - [2048, 384, 1, 1536] + - [1320, 8184.11] + - - [50176, 512, 1, 256] + - [1332, 9852.5] + - - [289, 256, 1, 1792] + - [1381, 3809.85] + - - [64, 448, 1, 1152] + - [1373, 2128.33] + - - [5041, 96, 1, 576] + - [1358, 5279.4] + - - [6272, 192, 1, 480] + - [1320, 7479.75] + - - [784, 32, 32, 256] + - [1347, 5709.01] + - - [1001, 32, 1, 2048] + - [1374, 2141.14] + - - [289, 192, 1, 1120] + - [1370, 3277.87] + - - [6272, 32, 1, 512] + - [1362, 4978.8] + - - [289, 384, 1, 3456] + - [1379, 5904.24] + - - [289, 384, 1, 2592] + - [1380, 5707.44] + - - [784, 128, 64, 512] + - [1326, 8864.49] + - - [12544, 1024, 1, 512] + - [1341, 10008.4] + - - [12544, 256, 1, 512] + - [1359, 8628.18] + - - [6272, 24, 1, 512] + - [1363, 3568.17] + - - [5041, 192, 1, 720] + - [1334, 8424.52] + - - [64, 320, 1, 1728] + - [1367, 1469.76] + - - [784, 128, 32, 256] + - [1335, 8104.24] + - - [289, 96, 1, 864] + - [1373, 1838.35] + - - [1225, 32, 32, 192] + - [1354, 5949.82] + - - [1568, 128, 1, 832] + - [1362, 5718.79] + - - [289, 128, 32, 768] + - [1320, 7289.35] + - - [3136, 256, 64, 64] + - [1328, 9104.02] + - - [196, 64, 1, 800] + - [1366, 915.72] + - - [4096, 512, 1, 9216] + - [1343, 10351.5] + - - [12544, 64, 1, 147] + - [1333, 5069.43] + - - [784, 32, 1, 400] + - [1364, 1140.46] + - - [6272, 160, 1, 512] + - [1324, 6140.18] + - - [1225, 48, 32, 288] + - [1330, 5978.71] + - - [64, 320, 1, 2880] + - [1371, 1920.1] + - - [1225, 64, 32, 192] + - [1324, 7641.11] + - - [1001, 32, 1, 1536] + - [1372, 2084.89] + - - [784, 64, 32, 256] + - [1316, 6990.61] + - - [64, 384, 1, 1152] + - [1373, 1862.7] + - - [784, 512, 64, 128] + - [1327, 9026.05] + - - [3136, 512, 1, 2048] + - [1345, 7764.4] + - - [6272, 144, 1, 512] + - [1320, 5574.14] + - - [1225, 192, 32, 384] + - [1334, 9373.93] + - - [64, 192, 1, 1728] + - [1372, 1206.56] + - - [8192, 320, 1, 1280] + - [1386, 9876.02] + - - [8192, 320, 1, 2048] + - [1389, 9745.8] + - - [8192, 384, 1, 1280] + - [1386, 10046.3] + - - [8192, 192, 1, 1280] + - [1389, 9951.0] + - - [8192, 192, 1, 2048] + - [1385, 9559.77] + - - [8192, 384, 1, 2048] + - [1387, 9945.84] + - - [8192, 448, 1, 2048] + - [1388, 9908.61] + - - [1001, 64, 1, 1536] + - [1382, 3650.04] + - - [8192, 448, 1, 1280] + - [1386, 9981.45] + - - [1001, 64, 1, 2048] + - [1383, 3580.97] + - - [1001, 128, 1, 2048] + - [1384, 5587.97] - null diff --git a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml index eb99e9a3c..a78fe0364 100644 --- a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml @@ -65675,24 +65675,24 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -65700,32 +65700,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -65734,36 +65739,46 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65773,6 +65788,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65782,53 +65798,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 413 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -65836,76 +65863,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -65913,6 +65950,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65922,6 +65960,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65931,133 +65970,159 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 414 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 2, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 5120 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66067,6 +66132,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66076,53 +66142,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 415 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -66130,8 +66207,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66139,67 +66216,77 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66207,6 +66294,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66216,6 +66304,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66225,53 +66314,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 416 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -66279,39 +66379,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66324,31 +66425,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66356,6 +66466,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66365,6 +66476,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66374,47 +66486,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 417 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 - SubGroup0: 12 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 12 + SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingType: B + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -66426,85 +66549,92 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66514,6 +66644,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66523,47 +66654,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 418 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id003 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -66577,76 +66719,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 24 - LSPB: 24 - LVCA: 8 - LVCB: 8 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 4608 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66654,6 +66806,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66663,6 +66816,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66672,47 +66826,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 419 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id003 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -66726,39 +66891,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66772,30 +66938,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66803,6 +66978,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66812,6 +66988,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66821,49 +66998,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 420 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -66873,10 +67061,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66887,27 +67075,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 16 - LSPB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66921,37 +67110,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66961,6 +67158,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66970,49 +67168,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 421 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id009 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -67022,8 +67233,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -67036,27 +67247,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 16 - LSPB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67070,30 +67282,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -67101,6 +67320,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -67110,6 +67330,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -67119,49 +67340,33707 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 422 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 423 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS0_FL1_GRVW2_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 424 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 425 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 426 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 427 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 428 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 429 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 430 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 431 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 432 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 433 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 434 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 435 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 436 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 437 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 438 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 439 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 440 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 441 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 442 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 443 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 444 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 445 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 446 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 447 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 448 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 449 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW1_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 450 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 451 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 452 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 453 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 454 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 455 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 456 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 457 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 458 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 459 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 460 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 461 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 462 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3200 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 128 + LdsOffsetB_Blk: 2176 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 463 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 464 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 465 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 466 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 467 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 468 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 469 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 470 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 471 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 472 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 473 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 474 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 475 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 476 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 477 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 478 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 479 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 480 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 481 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 482 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 483 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 484 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 485 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA4_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 486 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 487 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 488 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 489 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 490 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 491 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 492 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 493 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 494 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 495 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 496 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR0_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 497 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 498 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB0_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 499 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 500 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 501 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 502 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 503 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 504 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 505 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 506 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 507 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU4_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 508 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 509 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 510 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 511 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 512 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 513 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3200 + LdsOffsetA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 514 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 515 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 516 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 517 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 518 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 519 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 520 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 2, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 5120 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 521 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 522 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 523 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 524 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 24 + LSPB: 24 + LVCA: 8 + LVCB: 8 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 525 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 526 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 527 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 528 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 529 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 530 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 531 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 532 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 533 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 534 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 535 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 536 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 537 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 538 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 539 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 540 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 541 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 542 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 543 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 544 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 545 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 546 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 547 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 548 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 549 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 550 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 551 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 552 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 553 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 554 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 555 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 556 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 557 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 558 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 559 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 560 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 561 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 562 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 563 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 564 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 565 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 566 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 567 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 568 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 569 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 570 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 571 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 572 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 573 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 574 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 575 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 576 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 577 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 578 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 579 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 580 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 581 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 582 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 583 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 584 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 585 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 586 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 587 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 588 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 589 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 590 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 591 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 592 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 593 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 594 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 595 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 596 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 597 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 598 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 599 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 600 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 601 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [6, 8] + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 602 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 603 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 604 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 605 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 606 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 607 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 608 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 609 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 610 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 611 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 612 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 613 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 614 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 615 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 616 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 617 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 618 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 619 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 620 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 621 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 622 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 623 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 624 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 625 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 626 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 627 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 628 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 629 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 4 + LSCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 630 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 2 + LSCB: 2 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 631 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 632 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 633 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id005 + ThreadTile: *id033 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -67171,10 +101050,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -67182,47 +101061,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67230,13 +101109,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 64 PerformanceSyncLocation: -1 @@ -67282,35 +101161,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 423 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 634 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 + ThreadTile: *id033 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -67320,58 +101199,207 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 635 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67379,8 +101407,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -67431,47 +101459,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 424 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 636 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 + VectorWidth: 2 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -67480,26 +101508,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 32 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 4 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 896 + LdsNumElements: 1024 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -67509,17 +101537,17 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -67528,15 +101556,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67580,85 +101608,83 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 425 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 637 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id005 + ThreadTile: *id033 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67666,10 +101692,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67677,15 +101703,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67729,96 +101753,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 426 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 638 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: *id038 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67826,15 +101849,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67878,47 +101899,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 427 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SolutionIndex: 639 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id036 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -67926,37 +101946,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67964,9 +101984,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -67976,14 +101996,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68027,47 +102045,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 428 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionIndex: 640 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -68075,48 +102092,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 16 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68125,14 +102142,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68176,96 +102191,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 429 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 + SolutionIndex: 641 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68273,15 +102287,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68325,96 +102337,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 430 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionIndex: 642 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 SubGroupA: 16 SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id039 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id009 + WorkGroup: *id037 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68422,15 +102433,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68474,35 +102483,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 431 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionIndex: 643 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: *id037 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -68512,58 +102521,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68572,14 +102580,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68623,35 +102629,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 432 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 644 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: *id038 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -68661,43 +102667,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68709,10 +102714,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68720,15 +102725,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68772,35 +102775,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 433 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 645 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + ThreadTile: *id039 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -68810,43 +102813,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68858,10 +102860,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68869,15 +102871,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68921,35 +102921,36 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 434 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 646 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id039 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -68959,41 +102960,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69007,10 +103008,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69018,15 +103019,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69070,35 +103074,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 435 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 647 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69108,8 +103122,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69119,14 +103133,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 @@ -69168,14 +103182,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69219,35 +103236,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 436 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 648 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69257,37 +103284,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3200 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -69306,9 +103333,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69316,15 +103343,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69368,35 +103398,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 437 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69406,41 +103446,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69454,10 +103494,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69465,15 +103505,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69517,35 +103560,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 438 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id009 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69555,41 +103608,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69603,10 +103656,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69614,15 +103667,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69666,35 +103722,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 439 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69704,8 +103770,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69715,30 +103781,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69752,10 +103818,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69763,15 +103829,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69815,48 +103884,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 440 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -69864,47 +103943,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69912,15 +103991,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69964,14 +104046,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 441 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -69981,39 +104070,42 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -70021,39 +104113,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70061,15 +104153,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70113,48 +104208,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 442 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70162,47 +104267,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 32 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70210,15 +104315,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70262,14 +104370,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 443 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -70279,16 +104394,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70301,57 +104419,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70359,15 +104477,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70411,48 +104532,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 444 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70460,36 +104591,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 32 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -70497,9 +104628,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -70508,15 +104639,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70560,33 +104694,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 445 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70599,7 +104743,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -70609,47 +104753,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70657,15 +104801,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70709,33 +104856,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 446 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70748,57 +104905,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70806,15 +104963,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70858,33 +105018,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 447 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70897,57 +105067,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70955,15 +105125,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71007,33 +105180,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 448 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71045,58 +105228,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71104,20 +105283,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -71156,33 +105338,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 449 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71195,36 +105387,36 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -71234,18 +105426,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71253,15 +105445,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71305,33 +105500,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 450 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 662 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71344,7 +105549,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -71354,12 +105559,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -71371,9 +105576,9 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -71383,18 +105588,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71402,15 +105607,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71454,33 +105662,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 451 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71493,57 +105711,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71551,15 +105769,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71603,46 +105824,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 452 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -71652,36 +105883,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -71689,10 +105920,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71700,15 +105931,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71752,33 +105986,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 453 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71791,7 +106035,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -71801,12 +106045,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -71818,26 +106062,26 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -71850,14 +106094,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71901,33 +106148,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 454 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71940,57 +106197,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71998,15 +106255,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72050,33 +106310,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 455 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72089,57 +106359,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72147,15 +106417,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72199,33 +106472,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 456 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72237,58 +106520,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72296,20 +106575,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -72348,33 +106630,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 457 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72386,8 +106678,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -72397,12 +106689,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -72414,29 +106706,25 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72445,20 +106733,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -72497,33 +106788,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 458 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id017 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72536,9 +106837,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -72546,47 +106847,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72594,15 +106895,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72646,33 +106950,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 459 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id020 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72685,7 +106999,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -72695,47 +107009,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72743,15 +107057,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72795,33 +107112,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 460 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 672 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72834,9 +107161,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -72844,46 +107171,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72893,14 +107220,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72944,33 +107274,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 461 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72983,57 +107323,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73041,15 +107381,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73093,96 +107436,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 462 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73190,15 +107543,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73242,85 +107598,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 463 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73328,10 +107694,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73339,15 +107705,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73391,33 +107760,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 464 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -73430,57 +107809,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73488,15 +107867,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73540,33 +107922,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 465 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -73579,7 +107971,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -73589,47 +107981,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73637,15 +108029,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73689,33 +108084,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 466 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -73728,7 +108133,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -73738,47 +108143,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73786,15 +108191,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73838,33 +108246,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 467 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -73876,58 +108294,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73935,20 +108349,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -73987,33 +108404,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 468 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -74026,57 +108453,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74084,15 +108511,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74136,33 +108566,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 469 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -74175,57 +108615,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74233,15 +108673,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74285,33 +108728,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 470 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -74323,10 +108776,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -74334,47 +108787,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2624 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74382,20 +108831,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -74434,33 +108886,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 471 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -74473,57 +108935,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74531,15 +108993,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74583,96 +109048,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 472 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74680,15 +109155,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74732,85 +109210,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 473 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3600 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -74818,10 +109306,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74829,15 +109317,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74881,46 +109372,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 474 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -74930,47 +109431,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 - LVPA: 16 - LVPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74978,15 +109479,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75030,46 +109534,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 475 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75079,43 +109593,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 6176 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -75127,15 +109641,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75179,46 +109696,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 476 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75228,43 +109755,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 6176 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -75276,15 +109803,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75328,46 +109858,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 477 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75377,63 +109917,66 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 - LVPA: 16 - LVPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MinGlobalWriteVectorWidth: 1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75477,48 +110020,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 478 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -75526,47 +110079,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75574,15 +110127,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75626,46 +110182,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 479 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75675,46 +110241,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75723,15 +110289,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75775,46 +110344,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 480 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75824,47 +110403,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75872,20 +110447,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -75924,46 +110502,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 481 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75973,36 +110561,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -76010,9 +110598,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76023,13 +110611,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76073,46 +110664,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 482 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76122,47 +110723,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76170,15 +110771,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76222,46 +110826,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 483 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 695 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76271,47 +110885,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76319,15 +110933,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76371,46 +110988,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 484 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 696 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76420,47 +111047,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 13376 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 256 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76468,15 +111095,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76520,46 +111150,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 485 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 697 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76569,36 +111209,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -76607,9 +111247,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76617,15 +111257,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76669,96 +111312,102 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 486 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 698 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2624 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76766,20 +111415,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -76818,46 +111470,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 487 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 699 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id020 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76867,36 +111529,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -76905,9 +111567,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76915,15 +111577,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76967,46 +111632,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 488 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 700 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77016,63 +111691,66 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MinGlobalWriteVectorWidth: 1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77116,46 +111794,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 489 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 701 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77165,46 +111853,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77213,15 +111901,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77265,47 +111956,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 490 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 702 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -77314,47 +112015,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 12864 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 512 + LdsOffsetB_Blk: 8704 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77362,15 +112063,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77414,46 +112118,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 491 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 703 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77463,36 +112177,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -77501,9 +112215,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77511,15 +112225,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77563,35 +112280,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 492 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 704 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77601,8 +112328,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77616,10 +112343,10 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 @@ -77629,15 +112356,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77650,9 +112377,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77660,15 +112387,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77712,35 +112442,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 493 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77750,8 +112490,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77765,10 +112505,10 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 @@ -77778,15 +112518,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77798,10 +112538,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77811,13 +112551,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77861,35 +112604,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 494 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 706 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77899,43 +112652,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77947,10 +112700,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77958,15 +112711,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78010,46 +112766,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 495 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [6, 8] - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78059,32 +112825,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78095,11 +112861,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78107,15 +112873,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78159,46 +112928,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 496 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 708 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78212,28 +112991,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 13440 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 8192 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78244,11 +113023,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78258,13 +113037,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78308,48 +113090,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 497 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 709 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -78360,29 +113152,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 32 + LVCA: 2 LVCB: 4 - LVPA: 16 + LVPA: 32 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3408 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78393,11 +113185,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78405,15 +113197,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78457,35 +113252,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 498 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 710 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -78495,8 +113300,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78506,32 +113311,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78543,10 +113348,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78554,15 +113359,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78606,47 +113414,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 499 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 711 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -78655,32 +113473,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78691,11 +113509,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78704,14 +113522,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78755,46 +113576,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 500 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 2 + WorkGroup: [8, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78804,32 +113635,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78840,7 +113671,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -78853,14 +113684,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78904,46 +113738,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 501 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 713 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78953,32 +113797,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78989,10 +113833,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -79001,15 +113845,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79053,48 +113900,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 502 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 714 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79102,32 +113959,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79138,11 +113995,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79150,15 +114007,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79202,33 +114062,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 503 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 715 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79241,7 +114111,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -79251,32 +114121,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79289,9 +114159,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79300,14 +114170,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79351,33 +114224,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 504 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 716 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79389,8 +114272,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -79400,12 +114283,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -79417,15 +114300,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79438,9 +114317,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79448,20 +114327,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -79500,33 +114382,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 505 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 717 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79539,42 +114431,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79586,10 +114478,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79597,15 +114489,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79649,33 +114544,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 506 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 718 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79688,7 +114593,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -79698,12 +114603,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -79715,15 +114620,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79747,14 +114652,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79798,33 +114706,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 507 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 719 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79837,42 +114755,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79885,9 +114803,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79895,15 +114813,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79947,33 +114868,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 508 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 720 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79986,7 +114917,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -79999,29 +114930,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80033,10 +114964,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80044,15 +114975,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80096,33 +115030,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 509 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 721 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80135,7 +115079,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -80148,9 +115092,9 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -80162,15 +115106,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80194,14 +115138,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80245,33 +115192,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 510 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80284,7 +115241,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -80297,9 +115254,9 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -80311,15 +115268,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80331,10 +115288,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80343,14 +115300,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80394,33 +115354,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 511 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80433,42 +115403,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80480,9 +115450,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80491,15 +115461,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80543,33 +115516,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 512 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80582,8 +115565,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -80592,32 +115575,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80629,10 +115612,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80640,15 +115623,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80692,33 +115678,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 513 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 725 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80731,42 +115727,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80778,10 +115774,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80789,15 +115785,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80841,33 +115840,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 514 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 726 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80880,7 +115889,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -80890,12 +115899,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -80907,15 +115916,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80927,10 +115936,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80938,15 +115947,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80990,46 +116002,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 515 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 727 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81039,32 +116061,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81075,10 +116097,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -81087,15 +116109,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81139,46 +116164,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 516 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 728 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81191,29 +116226,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81224,11 +116259,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81236,15 +116271,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81288,46 +116326,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 517 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 729 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81340,29 +116388,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81373,11 +116421,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81385,15 +116433,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81437,46 +116488,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 518 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 730 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81486,32 +116547,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81522,7 +116583,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -81535,14 +116596,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81586,46 +116650,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 519 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 731 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81635,32 +116709,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81671,11 +116745,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81683,15 +116757,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81735,46 +116812,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 520 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81784,32 +116871,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81820,11 +116907,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81833,14 +116920,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81884,46 +116974,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 521 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 733 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81933,32 +117033,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81969,11 +117069,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81981,15 +117081,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82033,33 +117136,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 522 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 734 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -82072,7 +117185,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -82082,12 +117195,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 @@ -82099,15 +117212,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 9280 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82119,9 +117232,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 32 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -82130,15 +117243,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82182,55 +117298,65 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 523 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 735 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -82238,25 +117364,25 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 4 - LSCB: 4 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 14464 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82267,11 +117393,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82279,15 +117405,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82331,48 +117460,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 524 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 736 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id032 - WorkGroupMapping: 1 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82380,32 +117519,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 2 - LSCB: 2 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82416,11 +117555,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82429,14 +117568,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82480,96 +117622,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 525 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 737 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id032 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 32 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3424 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82577,15 +117729,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82629,33 +117784,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 526 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 738 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -82668,57 +117833,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82726,15 +117891,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82778,96 +117946,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 527 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 739 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82875,15 +118053,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82927,96 +118108,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 528 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 740 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83024,15 +118215,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83076,96 +118270,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 529 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 741 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [8, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83173,15 +118377,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83225,33 +118432,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 530 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 742 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83264,57 +118481,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83322,15 +118539,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83374,32 +118594,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 531 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 743 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83411,56 +118642,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -83469,13 +118701,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83519,13 +118756,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 532 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 744 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - ThreadTile: *id036 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -83535,17 +118780,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id038 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83557,57 +118804,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83615,13 +118863,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83665,95 +118918,102 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 533 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 745 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4736 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 4160 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83761,18 +119021,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -83811,95 +119076,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 534 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 746 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2112 LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83907,13 +119183,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83957,91 +119238,102 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 535 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 747 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -84053,13 +119345,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84103,95 +119400,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 536 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 748 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id039 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id037 + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84199,13 +119507,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84249,33 +119562,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 537 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 749 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id037 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -84287,16 +119610,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -84307,37 +119631,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6272 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84345,13 +119669,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84395,13 +119724,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 538 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 750 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id036 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -84412,16 +119749,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id038 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -84433,16 +119772,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -84453,20 +119793,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -84480,10 +119820,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84495,9 +119835,14 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84541,13 +119886,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 539 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id039 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 751 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -84558,16 +119911,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id040 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -84579,6 +119934,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -84593,8 +119949,8 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -84644,6 +120000,11 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84687,26 +120048,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 540 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id039 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 752 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id040 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84740,7 +120110,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -84754,15 +120124,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84786,7 +120156,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -84844,8 +120214,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 541 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 753 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84864,9 +120234,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84888,10 +120258,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -84899,32 +120269,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2688 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84936,9 +120302,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -84948,12 +120314,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84963,7 +120329,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -85006,14 +120372,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 542 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 754 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -85026,9 +120392,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85053,7 +120419,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -85061,10 +120427,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -85072,21 +120438,21 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85098,10 +120464,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85110,10 +120476,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -85168,16 +120534,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 543 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 755 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -85188,9 +120554,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85223,10 +120589,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -85240,15 +120606,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85261,9 +120627,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85271,12 +120637,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85330,29 +120696,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 544 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 756 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85388,29 +120754,29 @@ GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85423,9 +120789,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85434,12 +120800,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85492,15 +120858,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 545 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 757 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -85512,9 +120878,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85547,32 +120913,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85584,10 +120950,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85595,13 +120961,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85654,29 +121020,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 546 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 758 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85700,7 +121066,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -85709,32 +121075,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85746,10 +121112,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85758,11 +121124,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85816,16 +121182,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 547 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 759 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -85836,9 +121202,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85871,10 +121237,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -85888,15 +121254,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85909,9 +121275,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85919,12 +121285,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85978,28 +121344,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 548 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 760 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 32, 1] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -86024,7 +121390,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -86033,32 +121399,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86070,10 +121436,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86082,11 +121448,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86140,16 +121506,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 549 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + SolutionIndex: 761 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -86160,8 +121526,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -86195,10 +121561,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -86212,15 +121578,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86233,9 +121599,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86243,12 +121609,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86302,8 +121668,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 550 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 762 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86311,18 +121677,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B @@ -86340,7 +121706,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86349,7 +121715,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -86365,24 +121731,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86393,11 +121759,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86405,11 +121771,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -86464,20 +121830,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 551 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -86485,10 +121851,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -86536,15 +121902,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86626,8 +121992,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 764 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86648,7 +122014,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86673,7 +122039,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -86681,7 +122047,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -86692,21 +122058,21 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86718,10 +122084,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86733,7 +122099,7 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -86788,16 +122154,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 765 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -86809,8 +122175,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86860,15 +122226,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86881,9 +122247,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86891,12 +122257,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86950,20 +122316,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 766 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -86971,8 +122337,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86994,9 +122360,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -87005,7 +122371,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -87015,18 +122381,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87038,10 +122408,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87049,12 +122419,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87065,7 +122435,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -87108,29 +122478,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 767 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -87146,7 +122516,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87171,24 +122541,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87199,11 +122569,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87215,9 +122585,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87270,15 +122640,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 768 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -87291,10 +122661,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87308,7 +122678,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87333,24 +122703,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87361,11 +122731,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87373,8 +122743,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -87432,8 +122802,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 769 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87442,11 +122812,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -87454,9 +122824,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87470,7 +122840,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87487,7 +122857,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -87495,24 +122865,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87523,11 +122893,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87535,12 +122905,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87594,31 +122964,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 770 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87632,7 +123002,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87652,29 +123022,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 512 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87685,7 +123055,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -87697,13 +123067,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87756,31 +123126,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 559 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + SolutionIndex: 771 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87794,7 +123164,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87811,32 +123181,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87847,11 +123217,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87859,13 +123229,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87918,31 +123288,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 560 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 772 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87956,7 +123326,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87965,7 +123335,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -87973,32 +123343,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88009,11 +123379,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88022,11 +123392,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88080,16 +123450,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 561 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 773 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -88100,11 +123470,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -88118,7 +123488,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88135,32 +123505,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88171,11 +123541,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88183,12 +123553,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88242,8 +123612,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 562 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 774 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88251,22 +123621,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -88286,10 +123656,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88307,17 +123677,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -88341,13 +123715,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88357,7 +123731,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -88373,6 +123747,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88382,6 +123757,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88400,29 +123776,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 563 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 775 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -88444,7 +123820,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -88472,10 +123848,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -88515,7 +123895,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -88531,6 +123911,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88540,6 +123921,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88558,8 +123940,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 564 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 776 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88567,7 +123949,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -88580,7 +123962,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -88596,7 +123978,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88605,7 +123987,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88613,47 +123995,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88661,11 +124043,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -88693,6 +124075,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88702,6 +124085,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88720,31 +124104,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 565 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 777 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -88758,7 +124142,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88775,7 +124159,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -88783,23 +124167,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -88811,11 +124195,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88829,7 +124213,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88855,6 +124239,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88864,6 +124249,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88882,16 +124268,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 566 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 778 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -88903,10 +124289,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -88947,37 +124333,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 + LSPA: 32 + LSPB: 16 LVCA: 4 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88985,13 +124371,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89017,6 +124403,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89026,6 +124413,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89044,20 +124432,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 567 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 779 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -89065,8 +124453,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -89082,7 +124470,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89091,7 +124479,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89099,47 +124487,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89147,11 +124535,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -89179,6 +124567,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89188,6 +124577,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89206,31 +124596,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 568 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 780 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -89244,7 +124634,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89264,44 +124654,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 64 - MacroTileA: 256 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89309,13 +124699,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89341,6 +124731,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89350,6 +124741,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89368,31 +124760,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 569 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 781 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -89426,28 +124818,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 128 - LSPB: 128 + LSPA: 32 + LSPB: 32 LVCA: 2 LVCB: 2 - LVPA: 32 - LVPB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -89460,10 +124852,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89471,13 +124863,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89503,6 +124895,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89512,6 +124905,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89530,29 +124924,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 570 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 782 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -89568,7 +124962,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89577,7 +124971,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89585,47 +124979,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89633,13 +125027,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89665,6 +125059,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89674,6 +125069,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89692,31 +125088,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 571 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 783 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -89730,7 +125126,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89747,47 +125143,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89795,13 +125191,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89827,6 +125223,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89836,6 +125233,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89854,31 +125252,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 572 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 784 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -89892,7 +125290,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89901,7 +125299,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89917,39 +125315,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89957,13 +125355,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89989,6 +125387,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89998,6 +125397,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90016,20 +125416,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 573 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 785 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 2 + SubGroupA: 8 + SubGroupB: 2 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -90037,10 +125437,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 2, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -90054,16 +125454,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -90079,19 +125479,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2112 - LdsPadA: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -90103,7 +125507,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -90119,7 +125523,7 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -90131,7 +125535,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90147,6 +125551,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90156,6 +125561,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90174,31 +125580,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 574 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 786 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -90212,7 +125618,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90237,23 +125643,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -90265,11 +125671,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90277,13 +125683,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90309,6 +125715,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90318,6 +125725,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90336,31 +125744,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 575 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 787 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -90401,37 +125809,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 16 + LSPB: 16 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90439,13 +125847,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90471,6 +125879,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90480,6 +125889,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90498,20 +125908,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 576 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 788 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 8 + SubGroupA: 2 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -90519,8 +125929,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroup: [2, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90545,7 +125955,7 @@ ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -90564,16 +125974,16 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 2112 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -90586,10 +125996,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90601,7 +126011,7 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -90629,6 +126039,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90638,6 +126049,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90656,15 +126068,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 577 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 789 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -90677,8 +126089,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90694,7 +126106,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90711,47 +126123,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90759,11 +126171,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -90791,6 +126203,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90800,6 +126213,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90818,16 +126232,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 578 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 790 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -90839,10 +126253,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -90856,7 +126270,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90865,7 +126279,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -90881,39 +126295,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 8 + LSCB: 8 LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPB: 8 + LVCA: 2 + LVCB: 4 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 2 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90921,13 +126335,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90953,6 +126367,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90962,6 +126377,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90980,15 +126396,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 579 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 791 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 2 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 2 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -91001,10 +126417,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 2, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -91027,7 +126443,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91038,29 +126454,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 128 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3600 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91072,10 +126488,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91083,13 +126499,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91115,6 +126531,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91124,6 +126541,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91142,28 +126560,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 580 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 792 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -91180,7 +126598,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91197,7 +126615,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -91205,39 +126623,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91245,11 +126663,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -91277,6 +126695,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91286,6 +126705,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91304,15 +126724,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 581 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 793 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -91324,11 +126744,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -91359,10 +126779,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -91376,15 +126796,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6176 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91408,7 +126828,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -91439,6 +126859,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91448,6 +126869,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91466,8 +126888,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 582 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 794 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91475,7 +126897,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -91486,9 +126908,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -91504,7 +126926,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91521,32 +126943,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6176 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91557,11 +126979,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91570,12 +126992,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91601,6 +127023,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91610,6 +127033,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91628,16 +127052,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 583 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 795 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -91648,11 +127072,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -91683,46 +127107,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91731,13 +127155,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91763,6 +127187,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91772,6 +127197,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91790,28 +127216,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 584 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 796 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 2 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 2 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [2, 8, 4] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -91828,7 +127254,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91853,39 +127279,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91893,13 +127319,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91925,6 +127351,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91934,6 +127361,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91952,31 +127380,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 797 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -91990,7 +127418,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -92010,44 +127438,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92055,11 +127483,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -92087,6 +127515,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92096,6 +127525,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92114,15 +127544,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 798 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -92135,10 +127565,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92158,7 +127588,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -92172,7 +127602,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -92186,11 +127616,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92203,9 +127637,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92213,12 +127647,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92229,13 +127663,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92245,6 +127680,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92254,6 +127690,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92272,8 +127709,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 799 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92281,18 +127718,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -92317,7 +127754,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92327,32 +127764,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92364,10 +127801,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92375,13 +127812,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92398,6 +127837,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92407,6 +127847,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92416,6 +127857,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92434,33 +127876,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 800 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92479,7 +127919,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92489,10 +127929,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -92506,15 +127946,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92526,10 +127966,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92537,12 +127977,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92560,6 +128002,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92569,6 +128012,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92578,6 +128022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92596,8 +128041,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 801 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92605,24 +128050,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92654,7 +128097,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -92668,15 +128111,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92688,9 +128131,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -92699,11 +128142,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -92722,6 +128165,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92731,6 +128175,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92740,6 +128185,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92758,28 +128204,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 802 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -92816,7 +128262,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -92830,15 +128276,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13376 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92850,10 +128296,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92862,11 +128308,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92884,6 +128330,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92893,6 +128340,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92902,6 +128350,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92920,15 +128369,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + SolutionIndex: 803 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [8, 8] ThreadTile0: 8 @@ -92940,8 +128389,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -92965,7 +128414,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92992,15 +128441,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93012,9 +128461,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -93023,11 +128472,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -93046,6 +128497,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93055,6 +128507,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93064,6 +128517,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93082,8 +128536,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 804 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93092,23 +128546,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93126,10 +128578,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93148,17 +128600,21 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93171,9 +128627,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93181,12 +128637,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -93197,13 +128655,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93213,6 +128672,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93222,6 +128682,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93240,33 +128701,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 805 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93295,7 +128754,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -93305,22 +128764,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93332,10 +128791,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93343,13 +128802,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93366,6 +128825,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93375,6 +128835,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93384,6 +128845,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93402,29 +128864,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 806 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -93457,7 +128919,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -93474,15 +128936,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93494,10 +128956,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93505,12 +128967,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -93528,6 +128990,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93537,6 +129000,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93546,6 +129010,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93564,8 +129029,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 807 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93573,12 +129038,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -93586,7 +129051,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -93609,7 +129074,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93629,22 +129094,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93656,10 +129121,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93667,13 +129132,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93690,6 +129157,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93699,6 +129167,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93708,6 +129177,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93726,33 +129196,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 808 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93771,8 +129239,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -93791,22 +129259,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12864 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 512 - LdsOffsetB_Blk: 8704 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93818,10 +129286,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93829,12 +129297,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -93852,6 +129322,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93861,6 +129332,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93870,6 +129342,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93888,20 +129361,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + SolutionIndex: 809 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -93909,12 +129382,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93960,15 +129431,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93980,9 +129451,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -93991,11 +129462,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -94014,6 +129485,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94023,6 +129495,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94032,6 +129505,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94050,29 +129524,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 810 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -94122,15 +129596,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -94142,9 +129616,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94153,11 +129627,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -94176,6 +129650,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94185,6 +129660,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94194,6 +129670,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94212,8 +129689,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 811 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94222,10 +129699,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -94234,7 +129711,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -94257,16 +129734,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -94277,20 +129754,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -94304,10 +129781,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94317,10 +129794,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -94338,6 +129817,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94347,6 +129827,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94356,6 +129837,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94374,15 +129856,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 600 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 812 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -94390,17 +129872,15 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94419,17 +129899,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -94441,18 +129921,18 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -94467,9 +129947,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94477,13 +129957,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94500,6 +129982,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94509,6 +129992,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94518,6 +130002,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94536,33 +130021,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 601 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 813 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94574,7 +130057,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94582,16 +130065,16 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -94599,22 +130082,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -94627,10 +130110,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94639,11 +130122,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 @@ -94662,6 +130145,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94671,6 +130155,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94680,6 +130165,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94698,31 +130184,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 602 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 814 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94736,7 +130222,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94744,15 +130230,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -94761,22 +130247,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -94789,10 +130275,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94801,11 +130287,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 @@ -94824,6 +130310,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94833,6 +130320,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94842,6 +130330,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94860,31 +130349,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 603 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 815 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94898,7 +130387,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94907,7 +130396,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94923,23 +130412,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3408 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 7200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true @@ -94951,11 +130440,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94963,12 +130452,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -94986,6 +130475,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94995,6 +130485,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95004,6 +130495,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95022,31 +130514,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 604 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 816 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -95067,17 +130559,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -95087,20 +130579,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -95115,9 +130607,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95125,13 +130617,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95148,6 +130642,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95157,6 +130652,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95166,6 +130662,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95184,33 +130681,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 605 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 817 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95229,42 +130724,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -95276,9 +130771,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -95287,12 +130782,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95310,6 +130807,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95319,6 +130817,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95328,6 +130827,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95346,33 +130846,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 606 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 818 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95391,42 +130889,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -95439,9 +130937,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95449,12 +130947,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95472,6 +130972,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95481,6 +130982,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95490,6 +130992,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95508,8 +131011,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 607 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 819 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -95517,24 +131020,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95553,17 +131054,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -95573,10 +131074,10 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -95613,10 +131114,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95634,6 +131137,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95643,6 +131147,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95652,6 +131157,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95670,8 +131176,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 608 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 820 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -95679,14 +131185,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -95695,8 +131201,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95715,8 +131219,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -95724,8 +131228,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -95735,20 +131239,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -95762,10 +131266,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95773,12 +131277,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95796,6 +131302,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95805,6 +131312,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95814,6 +131322,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95832,33 +131341,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 609 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95878,16 +131385,16 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -95899,18 +131406,18 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -95925,9 +131432,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95935,13 +131442,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95958,6 +131465,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95967,6 +131475,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95976,6 +131485,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95994,28 +131504,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 610 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 822 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -96038,18 +131548,18 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -96059,16 +131569,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -96083,9 +131597,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96093,12 +131607,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96109,13 +131623,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96125,6 +131640,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96134,6 +131650,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96152,8 +131669,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 611 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 823 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96161,12 +131678,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -96197,42 +131714,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96245,9 +131762,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96255,12 +131772,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96278,6 +131797,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96287,6 +131807,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96296,6 +131817,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96314,8 +131836,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 612 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 824 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96323,24 +131845,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96359,42 +131879,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96406,10 +131926,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96418,11 +131938,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96440,6 +131962,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96449,6 +131972,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96458,6 +131982,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96476,8 +132001,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 613 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 825 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96485,24 +132010,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96522,41 +132045,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96568,10 +132091,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96580,11 +132103,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96602,6 +132125,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96611,6 +132135,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96620,6 +132145,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96638,28 +132164,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 614 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 + SolutionIndex: 826 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -96684,19 +132210,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -96705,20 +132231,20 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96731,9 +132257,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96741,13 +132267,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96764,6 +132290,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96773,6 +132300,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96782,6 +132310,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96800,29 +132329,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 615 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 827 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -96845,42 +132374,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96892,9 +132421,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -96903,12 +132432,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96926,6 +132457,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96935,6 +132467,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96944,6 +132477,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96962,8 +132496,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 616 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 828 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96972,23 +132506,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97007,42 +132539,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2112 LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97066,11 +132598,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -97088,6 +132622,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97097,6 +132632,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97106,6 +132642,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97124,8 +132661,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 617 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 829 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97144,13 +132681,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97170,41 +132705,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97216,10 +132751,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97228,10 +132763,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -97250,6 +132785,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97259,6 +132795,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97268,6 +132805,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97286,29 +132824,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 618 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 830 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -97331,20 +132869,20 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -97352,21 +132890,21 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97378,10 +132916,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97389,11 +132927,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -97412,6 +132952,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97421,6 +132962,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97430,6 +132972,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97448,33 +132991,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 619 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 831 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97494,41 +133035,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97540,10 +133081,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97552,10 +133093,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -97574,6 +133115,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97583,6 +133125,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97592,6 +133135,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97610,16 +133154,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 620 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 832 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -97630,8 +133174,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -97655,7 +133199,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -97665,10 +133209,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -97682,15 +133226,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97703,9 +133247,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97713,12 +133257,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -97736,6 +133282,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97745,6 +133292,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97754,6 +133302,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97772,8 +133321,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 621 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 833 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97781,24 +133330,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97827,10 +133374,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -97844,15 +133391,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97876,7 +133423,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -97898,6 +133445,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97907,6 +133455,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97916,6 +133465,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97934,8 +133484,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 622 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 + SolutionIndex: 834 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97943,7 +133493,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -97954,9 +133504,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -97978,7 +133528,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -97992,29 +133542,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98026,10 +133572,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98037,13 +133583,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98053,13 +133599,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98069,6 +133616,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98078,6 +133626,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98096,29 +133645,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 623 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 835 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -98140,43 +133689,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98188,10 +133733,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98199,13 +133744,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98215,13 +133762,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98231,6 +133779,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98240,6 +133789,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98258,33 +133808,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 624 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 836 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98302,43 +133850,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98350,10 +133894,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98361,13 +133905,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98377,13 +133923,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98393,6 +133940,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98402,6 +133950,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98420,33 +133969,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 625 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 837 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98464,8 +134011,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98475,10 +134022,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -98492,15 +134039,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98512,9 +134055,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98523,11 +134066,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -98539,13 +134084,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98555,6 +134101,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98564,6 +134111,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98582,8 +134130,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 626 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 838 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98592,23 +134140,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98626,7 +134172,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -98637,32 +134183,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98675,9 +134217,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98686,12 +134228,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98701,13 +134243,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98717,6 +134260,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98726,6 +134270,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98744,15 +134289,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 627 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 839 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -98764,9 +134309,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -98788,8 +134333,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98799,32 +134344,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98836,10 +134377,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98847,13 +134388,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98863,13 +134406,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98879,6 +134423,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98888,6 +134433,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98906,33 +134452,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 628 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 840 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98944,13 +134488,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -98961,32 +134505,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 9280 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98997,11 +134537,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99009,13 +134549,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99025,13 +134565,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99041,6 +134582,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99050,6 +134592,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99068,31 +134611,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 629 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 841 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -99106,14 +134649,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -99126,29 +134669,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14464 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99159,11 +134698,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99171,13 +134710,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99187,13 +134728,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99203,6 +134745,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99212,6 +134755,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99230,33 +134774,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 630 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 842 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99268,13 +134810,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -99285,32 +134827,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99321,11 +134859,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99333,13 +134871,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99349,13 +134887,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99365,6 +134904,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99374,6 +134914,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99392,31 +134933,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 631 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 843 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -99430,16 +134971,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99450,29 +134991,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3424 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99483,11 +135020,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99496,7 +135033,9 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -99511,13 +135050,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99527,6 +135067,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99536,6 +135077,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99554,16 +135096,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 632 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 844 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -99574,13 +135116,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99592,16 +135132,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99612,29 +135152,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 64 - LVCA: 2 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99645,10 +135181,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -99657,8 +135193,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -99673,13 +135209,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99689,6 +135226,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99698,6 +135236,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99716,31 +135255,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 633 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 845 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -99760,8 +135299,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -99774,7 +135313,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -99788,15 +135327,11 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99808,9 +135343,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -99819,11 +135354,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -99835,13 +135372,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99851,6 +135389,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99860,6 +135399,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99878,8 +135418,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 634 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 846 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99887,24 +135427,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99922,9 +135460,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -99936,7 +135474,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -99944,21 +135482,17 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 64 - LVCA: 8 + LSPB: 32 + LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99971,9 +135505,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99981,13 +135515,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99997,13 +135531,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100013,6 +135548,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100022,6 +135558,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100040,29 +135577,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 635 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 847 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -100084,8 +135621,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -100098,29 +135635,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100132,10 +135665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100143,13 +135676,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100159,13 +135694,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100175,6 +135711,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100184,6 +135721,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100202,33 +135740,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 636 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 848 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100240,14 +135776,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -100260,29 +135796,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100293,11 +135825,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100305,13 +135837,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100321,13 +135855,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100337,6 +135872,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100346,6 +135882,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100364,33 +135901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 637 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 849 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100402,13 +135937,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 @@ -100427,22 +135962,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 32 - LVCA: 4 - LVCB: 8 + LVCA: 2 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 864 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 576 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -100455,10 +135986,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -100471,9 +136002,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100483,13 +136014,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100499,6 +136031,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100508,6 +136041,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100526,16 +136060,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 638 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 850 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -100547,10 +136081,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -100570,8 +136104,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -100591,20 +136125,16 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -100618,10 +136148,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100629,13 +136159,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100645,13 +136177,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100661,6 +136194,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100670,6 +136204,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100688,33 +136223,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 639 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 851 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100735,7 +136268,7 @@ ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -100753,16 +136286,16 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4736 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetB: 4160 + LdsOffsetB: 1088 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -100776,9 +136309,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 256 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 256 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -100787,13 +136320,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100810,6 +136343,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100819,6 +136353,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100828,6 +136363,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100846,20 +136382,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 640 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 852 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -100867,7 +136403,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -100890,7 +136426,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -100901,7 +136437,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -100911,20 +136447,16 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 576 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -100938,9 +136470,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -100949,13 +136481,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100965,13 +136497,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100981,6 +136514,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100990,6 +136524,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101008,20 +136543,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 641 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 853 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -101029,7 +136564,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -101052,7 +136587,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -101063,7 +136598,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101080,13 +136615,9 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -101127,13 +136658,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101143,6 +136675,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101152,6 +136685,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101170,8 +136704,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 642 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 854 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101192,7 +136726,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -101200,7 +136734,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101228,10 +136762,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -101242,15 +136777,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101262,10 +136797,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101273,12 +136808,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -101289,6 +136824,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101296,6 +136832,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101305,6 +136842,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101314,6 +136852,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101332,37 +136871,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 643 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 855 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 32 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101377,16 +136916,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101394,25 +136933,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101424,10 +136964,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101435,12 +136975,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -101451,6 +136993,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101458,6 +137001,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101467,6 +137011,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101476,6 +137021,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101494,8 +137040,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 644 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 856 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101503,12 +137049,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -101519,12 +137065,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101539,7 +137083,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -101556,6 +137100,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -101566,15 +137111,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101586,9 +137131,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -101597,11 +137142,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -101613,13 +137160,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101629,6 +137178,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101638,6 +137188,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101656,8 +137207,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 645 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 857 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101665,11 +137216,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -101681,12 +137232,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101703,7 +137252,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -101714,44 +137263,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101759,11 +137309,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -101775,6 +137325,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101782,6 +137333,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101791,6 +137343,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101800,6 +137353,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101818,28 +137372,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 646 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 858 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -101848,7 +137402,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101864,7 +137418,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -101880,40 +137434,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101921,12 +137476,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -101937,6 +137492,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101944,6 +137500,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101953,6 +137510,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101962,6 +137520,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101980,28 +137539,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 647 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 859 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -102010,7 +137569,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102018,15 +137577,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -102034,7 +137593,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102042,21 +137601,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2688 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2112 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102067,11 +137631,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102081,11 +137645,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102095,13 +137662,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102111,6 +137680,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102120,6 +137690,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102138,15 +137709,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 648 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 860 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -102159,16 +137730,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102176,15 +137745,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -102192,7 +137761,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102200,23 +137769,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -102229,11 +137799,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102243,11 +137813,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102257,6 +137830,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102264,6 +137838,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102273,6 +137848,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102282,6 +137858,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102300,16 +137877,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 649 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 861 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -102321,16 +137898,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102344,8 +137919,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -102358,10 +137933,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -102372,15 +137948,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102392,9 +137964,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -102403,13 +137975,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102419,13 +137994,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102435,6 +138012,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102444,6 +138022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102462,37 +138041,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 650 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 862 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102506,8 +138083,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -102517,30 +138094,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -102555,9 +138129,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102567,11 +138141,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102581,13 +138158,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102597,6 +138176,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102606,6 +138186,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102624,15 +138205,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 651 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 863 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -102645,16 +138226,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102686,25 +138265,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102717,9 +138297,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102731,9 +138311,10 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102743,6 +138324,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102750,6 +138332,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102759,6 +138342,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102768,6 +138352,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102786,16 +138371,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 652 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 864 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -102807,8 +138392,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -102816,7 +138401,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102830,9 +138415,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -102848,23 +138433,20 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 + LdsOffsetB: 1088 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -102878,10 +138460,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102894,8 +138476,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102905,13 +138488,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102921,6 +138506,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102930,6 +138516,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102948,16 +138535,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 653 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 865 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -102969,8 +138556,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -102978,7 +138565,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102995,7 +138582,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103010,40 +138597,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103051,13 +138639,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103067,6 +138656,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103074,6 +138664,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103083,6 +138674,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103092,6 +138684,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103110,20 +138703,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 654 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 866 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -103131,8 +138724,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -103140,7 +138733,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103148,64 +138741,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103213,13 +138807,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103229,6 +138826,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103236,6 +138834,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103245,6 +138844,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103254,6 +138854,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103272,37 +138873,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 655 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 867 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103310,15 +138909,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -103330,44 +138929,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103375,13 +138975,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103391,6 +138994,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103398,6 +139002,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103407,6 +139012,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103416,6 +139022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103434,37 +139041,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 656 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 868 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103472,64 +139077,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103537,13 +139143,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103553,6 +139162,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103560,6 +139170,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103569,6 +139180,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103578,6 +139190,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103596,37 +139209,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 657 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 869 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103634,16 +139245,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103654,44 +139265,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103699,13 +139311,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103715,6 +139330,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103722,6 +139338,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103731,6 +139348,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103740,6 +139358,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103758,37 +139377,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 658 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 870 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103804,7 +139421,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -103812,48 +139429,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 + LSPA: 32 + LSPB: 8 + LVCA: 8 LVCB: 8 LVPA: 16 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103861,13 +139479,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103877,6 +139496,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103884,6 +139504,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103893,6 +139514,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103902,6 +139524,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103920,29 +139543,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 659 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 871 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -103950,7 +139573,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103965,9 +139588,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103978,44 +139601,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104023,13 +139647,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104039,6 +139666,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -104046,6 +139674,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104055,6 +139684,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104064,6 +139694,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104082,37 +139713,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 660 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 872 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104120,7 +139749,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104128,8 +139757,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104137,32 +139766,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 8 + LSCB: 8 + LSPA: 96 LSPB: 64 - LVCA: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104173,11 +139803,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104185,13 +139815,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104201,6 +139832,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -104208,6 +139840,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104217,6 +139850,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104226,6 +139860,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104244,37 +139879,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 661 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 873 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104282,15 +139917,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -104306,25 +139941,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104335,10 +139971,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104347,13 +139983,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104363,13 +140002,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104379,6 +140020,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104388,6 +140030,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104406,37 +140049,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 662 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 874 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104444,15 +140085,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -104468,25 +140109,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104497,11 +140139,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104509,13 +140151,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104525,6 +140170,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -104532,6 +140178,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104541,6 +140188,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104550,6 +140198,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104568,8 +140217,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 663 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 875 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104577,12 +140226,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -104592,13 +140241,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104606,16 +140253,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104623,32 +140270,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104659,11 +140307,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104671,13 +140319,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104687,13 +140338,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104703,6 +140356,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104712,6 +140366,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104730,20 +140385,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 664 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 876 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -104751,16 +140406,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104768,14 +140421,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -104792,25 +140445,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104821,11 +140475,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104833,13 +140487,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104849,6 +140506,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -104856,6 +140514,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104865,6 +140524,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104874,6 +140534,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104892,8 +140553,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 665 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 877 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104901,12 +140562,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -104914,15 +140575,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104930,7 +140589,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104954,25 +140613,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104983,10 +140643,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104995,13 +140655,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105011,6 +140672,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105018,6 +140680,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105027,6 +140690,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -105036,6 +140700,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105054,8 +140719,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 666 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 878 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105063,11 +140728,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -105076,15 +140741,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105092,14 +140757,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -105116,25 +140781,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105145,7 +140811,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -105159,11 +140825,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105173,6 +140842,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105180,6 +140850,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105189,6 +140860,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -105198,6 +140870,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105216,8 +140889,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 667 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 879 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105225,7 +140898,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -105238,15 +140911,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105254,7 +140925,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105263,7 +140934,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105278,25 +140949,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105307,11 +140979,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105323,9 +140995,10 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105335,6 +141008,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105342,6 +141016,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105351,6 +141026,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -105360,6 +141036,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105378,8 +141055,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 668 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 880 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105387,12 +141064,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -105400,15 +141077,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105424,8 +141101,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105436,44 +141113,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105482,12 +141160,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105497,6 +141176,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105504,6 +141184,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105542,28 +141223,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 669 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 881 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -105572,7 +141253,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105587,16 +141268,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -105604,13 +141285,14 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -105641,17 +141323,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105661,6 +141348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105668,6 +141356,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105690,6 +141379,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105706,8 +141396,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 670 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 882 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105715,7 +141405,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -105731,12 +141421,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105744,15 +141432,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -105760,62 +141448,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105825,6 +141519,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105832,6 +141527,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105854,6 +141550,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105870,37 +141567,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 671 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 + SolutionIndex: 883 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105908,14 +141603,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -105928,27 +141623,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105961,25 +141657,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105989,6 +141690,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105996,6 +141698,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106018,6 +141721,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106034,16 +141738,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 672 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 + SolutionIndex: 884 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -106054,17 +141758,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106079,9 +141781,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106092,58 +141794,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106153,6 +141861,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106160,6 +141869,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106182,6 +141892,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106198,37 +141909,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 673 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 + SolutionIndex: 885 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106236,7 +141945,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106244,7 +141953,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -106252,62 +141961,66 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106317,6 +142030,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106324,6 +142038,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106346,6 +142061,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106362,37 +142078,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 674 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 + SolutionIndex: 886 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106400,7 +142116,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106409,7 +142125,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106420,58 +142136,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106481,6 +142201,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106488,6 +142209,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106510,6 +142232,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106526,37 +142249,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 675 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 + SolutionIndex: 887 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106564,7 +142287,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106584,27 +142307,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106617,25 +142341,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106645,6 +142372,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106652,6 +142380,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106674,6 +142403,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106690,16 +142420,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 676 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 + SolutionIndex: 888 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -106710,17 +142440,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106728,16 +142458,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106748,58 +142478,58 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106809,13 +142539,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106838,6 +142570,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106854,37 +142587,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 677 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 + SolutionIndex: 889 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106892,78 +142625,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6656 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106973,6 +142712,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106980,6 +142720,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107002,6 +142743,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107018,37 +142760,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 678 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 + SolutionIndex: 890 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107056,15 +142796,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -107072,62 +142812,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 8 - LVCA: 2 - LVCB: 4 - LVPA: 8 - LVPB: 4 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107137,6 +142883,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -107144,6 +142891,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107166,6 +142914,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107182,37 +142931,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 679 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 + SolutionIndex: 891 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 2 - SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 2, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107227,9 +142974,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -107244,23 +142991,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 LVPA: 32 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107274,24 +143022,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107301,13 +143054,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107330,6 +143085,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107346,8 +143102,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 680 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 892 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -107355,12 +143111,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -107371,12 +143127,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107384,14 +143138,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107408,23 +143162,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107437,25 +143192,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107465,13 +143225,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107494,6 +143256,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107510,37 +143273,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 681 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 893 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107555,7 +143316,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107572,54 +143333,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107629,6 +143396,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -107636,6 +143404,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107658,6 +143427,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107674,37 +143444,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 682 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 + SolutionIndex: 894 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 8 - SubGroupA: 2 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [2, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107718,8 +143486,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107736,6 +143504,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -107746,9 +143515,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107763,23 +143536,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107789,13 +143567,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107818,6 +143598,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107834,8 +143615,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 683 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 895 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -107844,11 +143625,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -107856,15 +143637,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107872,7 +143651,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -107880,70 +143659,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107953,6 +143736,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -107960,6 +143744,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107982,6 +143767,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107998,37 +143784,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 684 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 + SolutionIndex: 896 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108036,7 +143822,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108045,7 +143831,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108056,58 +143842,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 8 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108117,6 +143907,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -108124,6 +143915,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108146,6 +143938,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108162,37 +143955,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 685 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 + SolutionIndex: 897 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 2 - SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 2, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108208,7 +144001,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -108224,23 +144017,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 + LSPA: 64 + LSPB: 128 + LVCA: 4 LVCB: 2 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108254,24 +144048,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108281,13 +144078,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108310,6 +144109,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108326,28 +144126,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 686 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 + SolutionIndex: 898 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -108356,7 +144156,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108364,7 +144164,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108381,61 +144181,65 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108445,6 +144249,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -108452,6 +144257,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108474,6 +144280,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108490,37 +144297,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 687 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 + SolutionIndex: 899 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108552,6 +144359,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -108562,13 +144370,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108583,23 +144391,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108609,6 +144420,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -108616,6 +144428,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108638,6 +144451,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108654,8 +144468,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 688 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 900 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -108663,12 +144477,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -108676,7 +144490,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -108684,7 +144498,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108699,16 +144513,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -108716,23 +144530,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108746,24 +144561,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108773,13 +144593,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108802,6 +144624,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108818,20 +144641,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 689 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 901 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -108839,16 +144662,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108856,23 +144677,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -108880,54 +144701,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 4 - LVPB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108937,13 +144764,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108966,6 +144795,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108982,16 +144812,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 690 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 + SolutionIndex: 902 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 8 - SubGroupA: 2 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -109003,16 +144833,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [2, 8, 4] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109020,14 +144848,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -109040,58 +144868,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109101,13 +144935,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109130,6 +144966,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109146,16 +144983,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 691 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 903 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -109166,17 +145003,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109184,16 +145019,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109204,58 +145039,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109265,13 +145106,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109294,6 +145137,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109310,20 +145154,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 692 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 904 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -109331,16 +145175,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109355,7 +145197,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109368,10 +145210,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -109382,9 +145225,9 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -109403,23 +145246,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109429,6 +145277,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -109459,6 +145308,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109475,8 +145325,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 693 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 905 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109484,28 +145334,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109513,47 +145361,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -109566,27 +145415,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109596,8 +145446,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -109626,6 +145477,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109642,8 +145494,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 694 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 906 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109651,26 +145503,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109678,47 +145532,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -109731,27 +145586,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109761,8 +145617,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -109791,6 +145648,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109807,8 +145665,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 695 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 907 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109816,26 +145674,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109843,7 +145703,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109851,39 +145711,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -109896,25 +145757,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109924,6 +145788,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -109954,6 +145819,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109970,8 +145836,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 696 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 908 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109979,28 +145845,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110008,7 +145874,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110017,7 +145883,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110032,23 +145898,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110061,25 +145928,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110089,8 +145959,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110119,6 +145990,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110135,8 +146007,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 697 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 909 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110144,12 +146016,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -110157,15 +146029,15 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110173,16 +146045,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110193,27 +146065,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110226,27 +146099,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110256,8 +146130,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110286,6 +146161,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110302,8 +146178,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 698 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 910 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110311,26 +146187,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110338,16 +146216,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110362,23 +146240,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110391,27 +146270,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110421,8 +146301,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110451,6 +146332,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110467,8 +146349,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 699 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 911 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110476,12 +146358,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -110489,13 +146371,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110503,7 +146387,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110512,7 +146396,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110527,23 +146411,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110556,25 +146441,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110584,6 +146472,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -110614,6 +146503,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110630,8 +146520,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 700 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 912 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110639,14 +146529,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -110654,13 +146544,13 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110675,30 +146565,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -110729,17 +146620,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110749,8 +146645,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110779,6 +146676,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110795,8 +146693,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 701 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 913 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110804,7 +146702,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 @@ -110815,17 +146713,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110841,29 +146737,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -110894,19 +146791,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110916,8 +146816,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110946,6 +146847,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110962,8 +146864,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 702 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 914 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110971,18 +146873,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -110990,7 +146892,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111006,15 +146908,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -111022,13 +146924,14 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -111059,19 +146962,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111081,8 +146987,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -111111,6 +147018,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111127,8 +147035,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 703 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 915 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111136,7 +147044,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 @@ -111149,13 +147057,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111170,16 +147078,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -111187,13 +147095,14 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -111224,17 +147133,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111244,6 +147158,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -111274,6 +147189,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111290,8 +147206,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 704 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 916 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111299,28 +147215,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111335,16 +147249,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -111352,13 +147266,14 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -111389,17 +147304,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111409,6 +147329,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -111439,6 +147360,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111455,8 +147377,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 705 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 917 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111464,7 +147386,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 @@ -111480,12 +147402,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111501,41 +147421,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111547,26 +147468,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111576,6 +147500,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -111606,6 +147531,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111622,8 +147548,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 706 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 918 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111631,18 +147557,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -111650,7 +147576,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111666,15 +147592,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -111682,25 +147608,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111712,26 +147639,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111741,6 +147671,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -111771,6 +147702,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111787,8 +147719,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 707 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 919 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111796,11 +147728,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -111815,7 +147747,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111843,10 +147775,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -111857,15 +147790,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111877,24 +147810,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111904,8 +147840,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -111934,6 +147871,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111950,8 +147888,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 708 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 920 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111959,18 +147897,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -111980,7 +147918,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112008,10 +147946,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -112022,15 +147961,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112042,24 +147981,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112069,6 +148011,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -112099,6 +148042,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112115,8 +148059,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 709 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 921 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112124,18 +148068,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -112145,7 +148089,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112161,41 +148105,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112207,24 +148152,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112234,8 +148182,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -112264,6 +148213,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112280,8 +148230,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 710 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 922 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112289,20 +148239,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -112310,7 +148260,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112325,42 +148275,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112372,26 +148323,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112401,6 +148353,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -112431,6 +148384,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112447,8 +148401,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 711 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 923 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112456,26 +148410,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112490,16 +148446,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -112507,25 +148463,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112537,26 +148494,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112566,6 +148524,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -112596,6 +148555,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112612,8 +148572,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 712 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 924 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112621,26 +148581,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112655,42 +148617,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112702,26 +148665,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112731,8 +148695,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -112761,6 +148726,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112777,8 +148743,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 713 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 925 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112786,26 +148752,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112820,42 +148788,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112867,26 +148836,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112896,8 +148866,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -112926,6 +148897,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112942,8 +148914,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 714 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 926 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112951,26 +148923,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112985,42 +148959,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113032,26 +149007,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113061,6 +149037,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113091,6 +149068,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113107,8 +149085,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 715 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 927 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113116,26 +149094,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113151,41 +149131,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113197,24 +149178,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113224,6 +149208,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113254,6 +149239,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113270,8 +149256,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 716 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 928 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113279,20 +149265,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -113300,7 +149286,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113315,71 +149301,77 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113389,6 +149381,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113419,6 +149412,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113435,37 +149429,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 717 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 929 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113482,14 +149474,14 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -113497,56 +149489,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113556,6 +149552,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113586,6 +149583,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113602,35 +149600,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 718 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 930 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113646,7 +149644,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -113654,64 +149652,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113721,6 +149723,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113751,6 +149754,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113767,35 +149771,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 719 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 931 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113803,14 +149807,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -113827,25 +149831,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113856,25 +149861,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113884,8 +149894,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113914,12 +149925,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -113930,8 +149943,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 720 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 932 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113939,28 +149952,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113968,14 +149979,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -113992,25 +150003,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114021,25 +150033,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114049,6 +150066,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114079,12 +150097,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114095,8 +150115,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 721 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 933 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114104,12 +150124,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -114119,13 +150139,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114133,7 +150151,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -114157,25 +150175,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114186,27 +150205,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114216,6 +150238,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114246,12 +150269,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114262,8 +150287,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 722 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 934 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114271,26 +150296,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114298,7 +150323,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -114314,7 +150339,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -114322,25 +150347,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114351,27 +150377,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114381,6 +150410,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114411,12 +150441,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114427,8 +150459,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 723 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 935 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114436,11 +150468,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -114451,11 +150483,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114463,14 +150495,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -114479,7 +150511,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -114487,25 +150519,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114516,25 +150549,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114544,6 +150582,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114574,12 +150613,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114590,8 +150631,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 724 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 936 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114599,28 +150640,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114634,17 +150673,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -114652,25 +150691,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114682,26 +150718,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114711,7 +150750,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114741,12 +150781,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114757,8 +150799,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 725 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 937 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114766,11 +150808,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -114779,13 +150821,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114800,16 +150842,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -114817,25 +150859,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114847,24 +150890,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114874,6 +150922,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114904,12 +150953,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114920,8 +150971,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 726 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 938 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114929,11 +150980,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -114942,15 +150993,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114982,6 +151031,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -114992,15 +151042,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115013,25 +151063,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115041,6 +151094,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -115071,12 +151125,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115087,8 +151143,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 727 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 939 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115096,26 +151152,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115123,7 +151179,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -115131,15 +151187,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -115147,25 +151203,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115176,25 +151233,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115204,8 +151264,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -115234,12 +151295,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115250,8 +151313,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 728 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 940 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115259,12 +151322,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -115274,13 +151337,13 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115288,44 +151351,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -115337,25 +151405,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115365,7 +151436,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115395,12 +151467,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115411,37 +151485,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 729 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 941 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115449,45 +151523,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115498,27 +151573,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115528,6 +151604,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -115558,12 +151635,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115574,35 +151653,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 730 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 942 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS0_FL1_GRVW2_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115616,39 +151697,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115660,26 +151746,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115689,8 +151776,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -115719,12 +151807,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115735,35 +151825,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 731 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 943 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115771,16 +151863,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -115791,25 +151883,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115820,7 +151917,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -115828,19 +151925,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115850,7 +151948,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115880,12 +151979,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115896,8 +151997,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 732 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 944 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115916,15 +152017,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115932,16 +152035,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -115952,25 +152055,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115981,7 +152089,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -115989,17 +152097,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116009,7 +152120,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116039,12 +152151,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116055,8 +152169,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 733 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 945 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116075,17 +152189,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116100,7 +152214,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -116113,25 +152227,26 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116143,26 +152258,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116172,6 +152288,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -116202,12 +152319,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116218,35 +152337,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 734 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 946 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116254,16 +152375,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -116274,25 +152395,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116303,25 +152429,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116331,7 +152460,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116361,12 +152491,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116377,37 +152509,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 735 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 947 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116415,16 +152547,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -116435,25 +152567,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116464,27 +152601,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116494,7 +152632,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116524,12 +152663,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116540,35 +152681,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 736 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 948 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116576,45 +152719,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116625,25 +152773,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116653,7 +152806,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116683,12 +152837,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116699,37 +152855,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 737 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 949 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116737,15 +152891,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -116757,25 +152911,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116786,19 +152945,21 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 NumLoadsB: 1 @@ -116807,6 +152968,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116816,7 +152978,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116846,12 +153009,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116862,8 +153027,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 738 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 950 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116872,25 +153037,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116898,15 +153063,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -116918,25 +153083,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116947,25 +153117,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116975,7 +153150,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117005,12 +153181,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117021,8 +153199,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 739 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 951 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117031,27 +153209,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117059,15 +153235,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -117079,25 +153255,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117108,19 +153289,21 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 @@ -117128,7 +153311,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117138,7 +153322,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117168,12 +153353,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117184,35 +153371,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 740 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 952 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117220,45 +153407,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117269,25 +153461,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117297,7 +153492,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117327,12 +153523,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117343,37 +153541,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 741 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 953 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117381,16 +153579,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117401,25 +153599,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117430,19 +153633,21 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 @@ -117450,7 +153655,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117460,8 +153666,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -117490,12 +153697,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117506,35 +153715,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 742 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 + SolutionIndex: 954 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117542,16 +153751,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117562,25 +153771,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117591,27 +153805,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117621,7 +153838,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117651,12 +153869,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117667,35 +153887,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 743 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 955 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117709,9 +153929,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -117719,7 +153939,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -117727,21 +153947,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 LSPA: 64 - LSPB: 32 - LVCA: 2 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 864 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117753,24 +153978,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117780,8 +154008,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -117810,12 +154039,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117826,20 +154057,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 744 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 956 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -117847,8 +154078,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -117856,7 +154087,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117864,16 +154095,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117888,21 +154119,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117913,27 +154145,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117943,6 +154176,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -117973,12 +154207,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117989,20 +154225,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 745 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 957 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -118010,14 +154246,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -118031,7 +154269,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -118049,21 +154287,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -118075,24 +154318,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118102,7 +154348,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118132,12 +154379,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118148,20 +154397,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 746 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 958 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -118169,7 +154418,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -118178,7 +154427,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -118186,13 +154435,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -118210,21 +154459,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -118235,25 +154489,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118263,8 +154520,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -118293,12 +154551,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118309,37 +154569,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 747 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 959 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -118347,13 +154607,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -118371,21 +154631,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -118396,25 +154661,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118424,7 +154692,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118454,12 +154723,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118470,31 +154741,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 748 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 960 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -118525,7 +154796,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -118543,13 +154814,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118563,24 +154834,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118621,12 +154895,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118637,8 +154913,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 749 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 961 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118647,10 +154923,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -118662,7 +154938,7 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 32 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -118675,7 +154951,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118683,8 +154959,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118695,61 +154971,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118790,12 +155069,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118806,31 +155087,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 750 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 962 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118842,7 +155123,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118850,15 +155131,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -118868,55 +155149,58 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118928,7 +155212,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -118957,12 +155241,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118973,31 +155259,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 751 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 963 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119009,15 +155295,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -119029,28 +155315,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119063,25 +155349,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119122,12 +155413,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119138,15 +155431,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 752 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 964 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -119158,13 +155451,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119183,7 +155474,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119204,51 +155495,56 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 + LSPA: 32 LSPB: 64 LVCA: 8 LVCB: 4 - LVPA: 8 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119289,12 +155585,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119305,14 +155603,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 753 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 + SolutionIndex: 965 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -119326,12 +155624,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119350,7 +155646,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -119371,48 +155667,48 @@ KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 64 + LSPA: 32 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -119459,12 +155755,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119475,14 +155773,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 754 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 966 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -119496,10 +155794,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119518,7 +155818,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -119539,48 +155839,48 @@ KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 64 + LSPA: 32 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -119627,12 +155927,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119643,14 +155945,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 755 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 967 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -119664,10 +155966,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119679,23 +155983,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -119705,44 +156009,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -119761,8 +156071,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -119791,12 +156101,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119807,20 +156119,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 756 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 968 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW1_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -119828,10 +156140,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119843,23 +156155,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -119869,44 +156181,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -119925,7 +156243,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119955,12 +156273,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119971,15 +156291,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 757 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 969 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -119992,10 +156312,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120007,23 +156327,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -120033,48 +156353,52 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120121,12 +156445,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120137,15 +156463,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 758 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 970 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -120158,12 +156484,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120181,10 +156505,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120192,10 +156516,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -120204,41 +156528,49 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120255,7 +156587,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120285,12 +156617,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120301,15 +156635,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 759 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 971 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -120321,13 +156655,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120339,16 +156671,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120365,22 +156697,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 32 LVCA: 4 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120393,7 +156725,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -120401,12 +156733,16 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120453,12 +156789,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120469,8 +156807,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 760 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 972 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120493,9 +156831,7 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120507,80 +156843,80 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -120623,12 +156959,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120639,31 +156977,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 761 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 973 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120675,76 +157015,76 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 320 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 320 - LdsOffsetB_Blk: 2368 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120791,12 +157131,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120807,31 +157149,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 762 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 974 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120843,54 +157187,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -120898,24 +157242,24 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -120959,12 +157303,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120975,31 +157321,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 763 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 975 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121011,16 +157359,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121031,34 +157379,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -121066,21 +157414,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121127,12 +157475,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121143,31 +157493,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 764 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + SolutionIndex: 976 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121207,20 +157559,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 8 + LSPA: 8 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121234,14 +157586,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -121293,12 +157647,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121309,15 +157665,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 765 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 977 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -121330,7 +157686,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -121354,9 +157710,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121367,7 +157723,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -121375,20 +157731,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 + LSPA: 8 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121402,21 +157758,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121463,12 +157819,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121479,31 +157837,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 766 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 978 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121515,7 +157875,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121523,7 +157883,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -121535,54 +157895,56 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 96 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 24 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121629,12 +157991,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121645,31 +158009,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 767 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 979 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -121683,7 +158047,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121691,66 +158055,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 128 - LVCA: 4 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -121770,7 +158136,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -121799,12 +158165,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121815,31 +158183,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 768 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 980 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121851,7 +158219,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121859,8 +158227,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121871,54 +158239,56 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 128 - LVCA: 4 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -121967,12 +158337,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121983,31 +158355,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 769 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 981 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122019,16 +158391,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -122039,56 +158411,56 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 8 LSPB: 64 - LVCA: 2 + LVCA: 8 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 + LdsOffsetB_Blk: 2176 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 8 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 8 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122106,7 +158478,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -122135,12 +158507,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122151,31 +158525,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 770 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 982 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122195,7 +158571,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -122207,7 +158583,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -122215,52 +158591,54 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -122303,12 +158681,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122319,28 +158699,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 771 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 983 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -122355,15 +158735,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -122375,54 +158755,58 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122469,12 +158853,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122485,33 +158871,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 772 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 984 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122531,7 +158915,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -122551,46 +158935,48 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -122639,12 +159025,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122655,14 +159043,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 773 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 985 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -122676,8 +159064,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -122691,16 +159079,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -122717,48 +159105,52 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 2 + LVCA: 16 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122805,12 +159197,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122821,20 +159215,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 774 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 986 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -122842,12 +159236,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122866,8 +159258,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -122876,7 +159268,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -122887,46 +159279,50 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 16 LSPB: 64 - LVCA: 8 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122973,12 +159369,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122989,14 +159387,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 775 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 987 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -123010,12 +159408,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123036,7 +159432,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123056,36 +159452,36 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123095,14 +159491,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123152,6 +159548,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123162,20 +159559,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 776 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 988 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -123183,7 +159580,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -123207,7 +159604,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123218,7 +159615,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -123227,36 +159624,36 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123266,14 +159663,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123323,6 +159720,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123333,29 +159731,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 777 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 989 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -123377,8 +159775,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123389,7 +159787,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -123397,37 +159795,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123437,8 +159835,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -123494,6 +159892,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123504,28 +159903,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 778 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 990 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -123547,9 +159946,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123560,7 +159959,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -123568,37 +159967,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 8 LVPA: 16 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123608,10 +160007,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -123665,6 +160062,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123675,31 +160073,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 779 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 991 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123718,20 +160118,20 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -123739,16 +160139,16 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -123758,18 +160158,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123779,12 +160179,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123834,6 +160236,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123844,15 +160247,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 780 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 992 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -123864,13 +160267,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123889,9 +160290,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123911,15 +160312,15 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 16 LVCA: 4 - LVCB: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -123929,18 +160330,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123950,8 +160351,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124005,6 +160408,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124015,15 +160419,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 781 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 993 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124036,12 +160440,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124060,9 +160462,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124070,10 +160472,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -124082,15 +160484,15 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 16 LVCA: 4 - LVCB: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -124100,18 +160502,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124121,8 +160523,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124176,6 +160580,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124186,15 +160591,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 782 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 994 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124206,13 +160611,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124230,10 +160633,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124253,32 +160656,36 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 16 LVCA: 4 - LVCB: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124288,8 +160695,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124306,7 +160715,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -124343,6 +160752,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124353,15 +160763,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 783 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 995 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124374,12 +160784,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124399,19 +160807,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -124419,37 +160827,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124459,14 +160867,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124516,6 +160924,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124526,29 +160935,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 784 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 + SolutionIndex: 996 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -124562,14 +160971,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -124578,49 +160987,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 2 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124630,14 +161039,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124687,6 +161094,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124697,31 +161105,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 785 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 + SolutionIndex: 997 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124733,7 +161143,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -124742,14 +161152,14 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -124759,24 +161169,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 128 - LVCA: 4 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124787,11 +161197,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124801,14 +161211,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124822,7 +161232,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -124858,6 +161268,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124868,8 +161279,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 786 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 998 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124878,11 +161289,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -124892,7 +161303,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124904,23 +161315,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -124930,24 +161341,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124958,11 +161369,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124972,14 +161383,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125029,6 +161438,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125039,8 +161449,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 787 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 999 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125049,11 +161459,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -125061,9 +161471,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125075,14 +161487,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -125095,30 +161507,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125129,11 +161541,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125143,14 +161555,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125200,6 +161610,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125210,8 +161621,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 788 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1000 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125220,21 +161631,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125254,7 +161667,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -125266,7 +161679,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -125274,22 +161687,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125301,10 +161714,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125314,14 +161727,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125371,6 +161784,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125381,8 +161795,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 789 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1001 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125391,17 +161805,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -125417,7 +161831,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -125426,41 +161840,41 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125471,11 +161885,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125485,12 +161899,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125540,6 +161954,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125550,8 +161965,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 790 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1002 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125560,21 +161975,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -125595,7 +162010,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -125623,15 +162038,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125658,6 +162073,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -125711,6 +162128,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125721,8 +162139,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 791 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 + SolutionIndex: 1003 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125743,11 +162161,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125766,7 +162182,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -125794,14 +162210,14 @@ LVPA: 32 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -125829,6 +162245,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -125846,7 +162264,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -125882,6 +162300,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125892,8 +162311,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 792 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1004 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA4_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125914,11 +162333,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125937,7 +162354,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -125950,7 +162367,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -125965,15 +162382,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125985,10 +162402,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125999,11 +162416,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126053,6 +162472,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126063,8 +162483,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 793 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1005 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126073,23 +162493,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126108,16 +162526,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -126129,22 +162547,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126157,9 +162575,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126169,13 +162587,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -126224,6 +162644,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126234,33 +162655,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 794 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1006 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126272,7 +162691,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126280,15 +162699,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -126298,24 +162717,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126326,11 +162745,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126340,15 +162759,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -126361,7 +162780,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126397,6 +162816,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126407,20 +162827,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 795 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1007 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -126428,10 +162848,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126443,7 +162863,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126451,42 +162871,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126497,11 +162917,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126511,15 +162931,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -126532,7 +162952,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126568,6 +162988,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126578,31 +162999,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 796 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1008 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126614,7 +163035,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126623,7 +163044,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -126640,24 +163061,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126668,11 +163089,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126682,15 +163103,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -126703,7 +163124,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126739,6 +163160,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126749,20 +163171,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 797 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1009 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -126770,10 +163192,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126785,7 +163207,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126793,7 +163215,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -126801,38 +163223,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -126840,10 +163262,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126853,14 +163275,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126874,7 +163296,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126910,6 +163332,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126920,31 +163343,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 798 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1010 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126956,7 +163379,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126964,57 +163387,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127024,14 +163447,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127081,6 +163504,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127091,31 +163515,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 799 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1011 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127127,15 +163551,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127144,37 +163568,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127182,10 +163606,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127195,12 +163619,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127214,7 +163640,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -127250,6 +163676,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127260,33 +163687,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 800 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127298,54 +163723,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127353,10 +163778,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127366,11 +163791,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -127385,7 +163812,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -127421,6 +163848,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127431,33 +163859,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 801 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1013 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127469,54 +163895,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127524,10 +163950,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127537,11 +163963,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -127592,6 +164020,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127602,33 +164031,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 802 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1014 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127640,15 +164067,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127656,8 +164083,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -127666,28 +164093,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127695,10 +164122,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127708,12 +164135,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127763,6 +164192,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127773,33 +164203,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 803 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1015 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR0_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127811,54 +164239,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127866,10 +164294,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127879,11 +164307,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -127898,7 +164328,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -127934,6 +164364,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127944,33 +164375,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 804 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127982,54 +164411,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128037,10 +164466,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128050,11 +164479,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -128069,7 +164500,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128105,6 +164536,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128115,33 +164547,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 805 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1017 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB0_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128153,54 +164583,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128208,10 +164638,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128221,11 +164651,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -128276,6 +164708,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128286,33 +164719,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 806 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128324,7 +164755,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128332,57 +164763,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 8 LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128392,14 +164823,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128413,7 +164844,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128449,6 +164880,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128459,31 +164891,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 807 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1019 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128495,7 +164927,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128503,57 +164935,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 8 LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128563,14 +164995,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128584,7 +165016,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128620,6 +165052,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128630,31 +165063,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 808 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 + SolutionIndex: 1020 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128666,7 +165099,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128674,57 +165107,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128734,14 +165167,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128755,7 +165188,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128791,6 +165224,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128801,31 +165235,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 809 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1021 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128837,14 +165271,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -128854,48 +165288,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128905,14 +165339,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128962,6 +165394,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128972,31 +165405,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1022 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129008,7 +165443,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -129025,48 +165460,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129076,14 +165511,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129133,6 +165568,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -129143,31 +165579,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1023 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129179,7 +165615,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -129188,7 +165624,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -129196,7 +165632,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -129205,39 +165641,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129247,14 +165683,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129304,6 +165740,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -129314,31 +165751,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1024 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129350,16 +165787,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -129367,48 +165804,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129418,14 +165855,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129475,6 +165910,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -129485,31 +165921,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1025 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129521,65 +165959,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129589,12 +166027,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129608,7 +166048,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -129644,6 +166084,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -129654,33 +166095,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1026 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU4_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129699,8 +166138,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -129708,11 +166147,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -129720,37 +166159,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129760,12 +166199,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129815,43 +166256,44 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1027 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129870,16 +166312,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -129891,37 +166333,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129931,12 +166373,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129950,7 +166394,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -129986,43 +166430,44 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1028 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130041,16 +166486,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -130062,37 +166507,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130102,12 +166547,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -130157,43 +166604,44 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1029 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130225,7 +166673,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -130240,15 +166688,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130260,10 +166708,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130273,12 +166721,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -130328,18 +166776,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1030 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130348,19 +166799,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -130376,7 +166827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -130402,24 +166853,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130430,11 +166881,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130444,11 +166895,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -130463,7 +166914,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -130499,18 +166950,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1031 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130519,11 +166973,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -130531,9 +166985,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -130554,7 +167008,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -130567,7 +167021,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -130582,15 +167036,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130602,9 +167056,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -130615,11 +167069,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -130634,7 +167090,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -130670,18 +167126,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 820 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1032 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130690,23 +167149,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130724,8 +167181,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -130753,15 +167210,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3200 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130774,9 +167227,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130786,12 +167239,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -130804,7 +167259,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -130841,18 +167296,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 821 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1033 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130861,11 +167319,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -130873,11 +167331,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130909,7 +167365,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -130924,15 +167380,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130945,9 +167401,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130957,12 +167413,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -131012,18 +167468,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 822 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1034 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -131032,17 +167491,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -131067,8 +167526,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -131088,36 +167547,36 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131128,13 +167587,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -131185,30 +167642,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 823 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1035 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -131216,10 +167676,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131239,15 +167701,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -131259,37 +167721,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131299,15 +167761,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -131356,25 +167818,28 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 824 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 + SolutionIndex: 1036 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -131387,8 +167852,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -131410,19 +167875,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -131430,33 +167895,33 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -131470,15 +167935,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -131527,18 +167992,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 825 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1037 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -131557,8 +168025,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -133817,24 +170285,10 @@ - [95, 6513.35] - - [4288, 1024, 1, 128] - [80, 4291.67] - - - [512, 2048, 1, 49] - - [126, 4554.98] - - - [512, 128, 1, 784] - - [119, 3195.29] - - - [2048, 512, 1, 49] - - [127, 4253.33] - - - [1024, 256, 1, 196] - - [123, 4039.33] - - [256, 64, 1, 3136] - [121, 3015.27] - - [256, 1024, 1, 196] - [125, 4225.35] - - - [64, 256, 1, 3136] - - [122, 3058.35] - - - [128, 512, 1, 784] - - [120, 3380.28] - - - [64, 64, 1, 3136] - - [124, 1372.34] - - [1024, 1024, 1, 3328] - [237, 8705.0] - - [2048, 200, 1, 3200] @@ -134667,8 +171121,6 @@ - [231, 5745.62] - - [1024, 200, 1, 1280] - [223, 4446.13] - - - [4096, 512, 1, 4096] - - [141, 9264.39] - - [2048, 256, 1, 3200] - [231, 7842.75] - - [2048, 512, 1, 15360] @@ -135169,64 +171621,28 @@ - [237, 6628.17] - - [4096, 1024, 1, 6144] - [139, 9592.98] - - - [1280, 384, 1, 64] - - [270, 3196.88] - - [256, 64, 1, 1225] - [271, 1194.67] - - [2048, 320, 1, 64] - [273, 3449.26] - - - [256, 48, 1, 1225] - - [264, 913.398] - - - [2048, 192, 1, 64] - - [263, 2516.58] - - [1024, 128, 1, 289] - [277, 2869.68] - - - [1280, 192, 1, 64] - - [256, 1872.46] - - - [192, 32, 1, 1225] - - [261, 505.806] - - - [1280, 448, 1, 64] - - [257, 3078.87] - - [384, 64, 1, 1225] - [262, 1511.33] - - [2048, 384, 1, 64] - [275, 3836.25] - - - [288, 48, 1, 1225] - - [258, 1032.59] - - [64, 80, 1, 5329] - [274, 888.167] - - [1024, 384, 1, 289] - [268, 4291.52] - - [2048, 448, 1, 64] - [267, 3783.52] - - - [1280, 320, 1, 64] - - [273, 2776.95] - - - [192, 64, 1, 1225] - - [258, 926.897] - - - [384, 192, 1, 1225] - - [269, 2560.0] - - - [1536, 256, 1, 64] - - [276, 2621.44] - - - [192, 48, 1, 1225] - - [261, 698.614] - - - [768, 128, 1, 289] - - [278, 2291.12] - - - [1024, 256, 1, 289] - - [276, 4064.36] - - [768, 192, 1, 289] - [272, 2690.33] - - - [1536, 384, 1, 64] - - [259, 3145.73] - - [288, 64, 1, 1225] - [261, 1142.67] - - - [1024, 192, 1, 289] - - [266, 3243.13] - - [384, 96, 1, 1225] - [279, 1844.71] - - - [160, 64, 1, 5329] - - [265, 1564.48] - - - [768, 160, 1, 289] - - [260, 2386.58] - - [1024, 3392, 1, 4096] - [305, 8502.92] - - [1024, 3301, 1, 4096] @@ -137411,5852 +173827,6340 @@ - [372, 5309.25] - - [2816, 8976, 1, 256] - [383, 9409.56] + - - [1728, 320, 1, 64] + - [419, 3205.57] + - - [1152, 128, 1, 784] + - [466, 3498.96] + - - [576, 96, 1, 5329] + - [452, 3947.92] + - - [864, 96, 1, 1225] + - [473, 3009.67] + - - [256, 128, 1, 784] + - [463, 1536.49] + - - [1440, 320, 1, 196] + - [416, 4824.62] + - - [192, 48, 1, 1225] + - [494, 820.465] + - - [2592, 384, 1, 289] + - [434, 7353.01] + - - [192, 80, 36, 10368] + - [484, 5360.04] + - - [896, 192, 1, 289] + - [451, 3076.56] + - - [768, 128, 1, 289] + - [476, 2351.81] + - - [64, 256, 1, 3136] + - [502, 1809.16] + - - [1280, 384, 1, 64] + - [416, 3171.1] + - - [512, 144, 1, 196] + - [474, 1445.07] + - - [1344, 192, 1, 289] + - [457, 4376.52] + - - [288, 64, 1, 21609] + - [468, 3396.12] + - - [400, 32, 1, 784] + - [495, 922.353] + - - [288, 32, 1, 21609] + - [506, 2816.01] + - - [1280, 448, 1, 64] + - [419, 3253.56] + - - [3456, 256, 1, 169] + - [431, 5822.44] + - - [2304, 256, 1, 196] + - [429, 4931.98] + - - [384, 192, 1, 1225] + - [477, 2720.39] + - - [832, 48, 1, 49] + - [472, 344.518] + - - [832, 192, 1, 49] + - [454, 1099.36] + - - [1280, 192, 1, 64] + - [455, 2069.56] + - - [192, 32, 1, 784] + - [494, 459.627] + - - [288, 48, 1, 1225] + - [501, 1176.0] + - - [512, 112, 1, 196] + - [469, 1277.21] + - - [224, 192, 36, 2592] + - [486, 7369.56] + - - [528, 32, 1, 196] + - [460, 440.374] + - - [192, 128, 36, 1568] + - [485, 8245.76] + - - [4032, 384, 1, 64] + - [430, 5898.24] + - - [576, 64, 1, 3136] + - [475, 2671.11] + - - [2048, 32, 1, 1001] + - [477, 2323.0] + - - [480, 64, 1, 196] + - [462, 752.64] + - - [512, 256, 1, 196] + - [464, 2528.55] + - - [864, 96, 1, 289] + - [474, 1958.4] + - - [896, 128, 1, 289] + - [477, 2725.73] + - - [192, 64, 1, 784] + - [492, 898.675] + - - [1200, 64, 1, 1225] + - [476, 2780.14] + - - [1296, 288, 1, 196] + - [415, 3826.18] + - - [576, 96, 1, 5041] + - [456, 3795.58] + - - [1024, 256, 1, 289] + - [445, 4488.13] + - - [1024, 2048, 1, 49] + - [435, 5077.1] + - - [192, 64, 36, 6272] + - [479, 7514.98] + - - [4096, 512, 1, 4096] + - [441, 10276.0] + - - [192, 32, 1, 1225] + - [495, 556.686] + - - [1024, 256, 1, 196] + - [455, 3892.44] + - - [1120, 192, 1, 289] + - [444, 3752.81] + - - [400, 48, 1, 196] + - [469, 480.0] + - - [1728, 224, 1, 1225] + - [422, 5575.77] + - - [800, 96, 1, 784] + - [476, 2668.94] + - - [1152, 384, 1, 64] + - [426, 3077.34] + - - [4608, 512, 1, 49] + - [433, 4676.6] + - - [1792, 256, 1, 289] + - [426, 5345.94] + - - [864, 128, 1, 784] + - [476, 3816.2] + - - [1728, 384, 1, 169] + - [428, 5191.68] + - - [480, 16, 1, 196] + - [497, 241.231] + - - [1568, 256, 1, 289] + - [416, 4723.41] + - - [1152, 448, 1, 64] + - [422, 3356.72] + - - [512, 64, 1, 196] + - [461, 802.816] + - - [1344, 224, 1, 289] + - [416, 3519.63] + - - [9216, 512, 1, 4096] + - [439, 9146.02] + - - [27, 32, 1, 22201] + - [507, 264.356] + - - [1152, 192, 1, 784] + - [446, 4904.08] + - - [1536, 256, 1, 64] + - [414, 2578.47] + - - [800, 128, 1, 196] + - [476, 1991.11] + - - [800, 64, 1, 196] + - [471, 1150.83] + - - [864, 208, 1, 196] + - [448, 2684.72] + - - [1440, 320, 1, 49] + - [417, 2313.44] + - - [512, 128, 1, 784] + - [467, 2780.32] + - - [720, 192, 1, 5041] + - [442, 5410.46] + - - [256, 64, 1, 784] + - [499, 1163.5] + - - [256, 48, 1, 1225] + - [494, 1075.2] + - - [576, 192, 1, 3136] + - [442, 4833.01] + - - [160, 64, 1, 5329] + - [496, 1753.5] + - - [3456, 384, 1, 289] + - [436, 7341.75] + - - [32, 32, 36, 43808] + - [490, 1378.03] + - - [1344, 512, 1, 64] + - [415, 3822.93] + - - [192, 16, 1, 784] + - [495, 228.073] + - - [3456, 384, 1, 169] + - [432, 6675.02] + - - [1152, 256, 1, 196] + - [425, 3211.26] + - - [1728, 192, 1, 1225] + - [426, 4852.26] + - - [2048, 512, 1, 49] + - [438, 3471.64] + - - [576, 96, 1, 1225] + - [469, 2176.66] + - - [512, 2048, 1, 49] + - [420, 3845.83] + - - [1728, 192, 1, 64] + - [415, 2369.83] + - - [832, 256, 1, 49] + - [445, 1433.6] + - - [512, 128, 1, 196] + - [470, 1459.67] + - - [1200, 128, 1, 49] + - [465, 1069.09] + - - [528, 256, 1, 196] + - [453, 2069.76] + - - [256, 512, 1, 784] + - [476, 4538.89] + - - [480, 192, 1, 196] + - [476, 1792.0] + - - [96, 64, 36, 2592] + - [483, 4845.41] + - - [96, 96, 36, 2592] + - [488, 5111.53] + - - [1024, 192, 1, 289] + - [450, 3431.14] + - - [1536, 384, 1, 64] + - [421, 3166.84] + - - [192, 96, 1, 784] + - [461, 881.14] + - - [2048, 192, 1, 64] + - [418, 2330.17] + - - [192, 64, 1, 1225] + - [500, 1100.35] + - - [512, 32, 1, 196] + - [491, 477.867] + - - [128, 96, 36, 1568] + - [487, 6649.09] + - - [528, 128, 1, 196] + - [473, 1403.23] + - - [128, 512, 1, 784] + - [463, 2237.81] + - - [128, 128, 36, 3136] + - [480, 6538.77] + - - [528, 160, 1, 196] + - [477, 1642.67] + - - [448, 64, 1, 5329] + - [452, 3264.81] + - - [1280, 320, 1, 64] + - [416, 2776.95] + - - [1792, 320, 1, 289] + - [428, 5204.9] + - - [2880, 320, 1, 64] + - [424, 4336.94] + - - [147, 64, 1, 12544] + - [505, 2430.27] + - - [4096, 512, 1, 1001] + - [440, 9618.99] + - - [1536, 32, 1, 1001] + - [477, 1757.18] + - - [512, 160, 1, 196] + - [473, 1592.89] + - - [768, 160, 1, 289] + - [474, 2757.17] + - - [1728, 384, 1, 49] + - [426, 3102.49] + - - [64, 32, 36, 43808] + - [481, 2626.43] + - - [64, 64, 1, 3136] + - [493, 610.506] + - - [256, 32, 1, 784] + - [494, 612.837] + - - [480, 96, 1, 196] + - [469, 1055.1] + - - [1024, 32, 1, 1001] + - [459, 1188.43] + - - [832, 160, 1, 49] + - [474, 959.247] + - - [512, 1024, 1, 196] + - [417, 4978.7] + - - [96, 64, 36, 10368] + - [511, 5000.95] + - - [384, 448, 36, 512] + - [516, 8903.0] + - - [2048, 64, 1, 1001] + - [509, 4385.13] + - - [224, 192, 36, 5184] + - [515, 7487.81] + - - [2048, 128, 1, 1001] + - [508, 5764.63] + - - [96, 96, 36, 10368] + - [517, 5275.21] + - - [192, 80, 36, 20736] + - [513, 5409.4] + - - [96, 64, 36, 5184] + - [511, 4911.83] + - - [1536, 64, 1, 1001] + - [510, 3162.03] + - - [96, 64, 36, 20736] + - [512, 5034.33] + - - [384, 448, 36, 256] + - [514, 8815.87] + - - [96, 96, 36, 5184] + - [518, 5236.02] - - [1024, 128, 1, 128] - - [425, 896.319] + - [531, 896.319] - - [4, 704, 1, 1280] - - [462, 328.976] + - [568, 328.976] - - [4, 1856, 1, 3328] - - [472, 501.461] + - [578, 501.461] - - [1856, 448, 1, 3328] - - [517, 5678.01] + - [623, 5678.01] - - [2944, 4288, 1, 1280] - - [503, 8412.49] + - [609, 8412.49] - - [2368, 64, 1, 3328] - - [453, 4914.02] + - [559, 4914.02] - - [1760, 32, 1, 1760] - - [480, 3313.04] + - [586, 3313.04] - - [2368, 5888, 1, 256] - - [503, 6489.82] + - [609, 6489.82] - - [5888, 1856, 1, 256] - - [515, 7791.98] + - [621, 7791.98] - - [128, 64, 1, 256] - - [487, 369.317] + - [593, 369.317] - - [512, 24000, 1, 1536] - - [509, 8827.47] + - [615, 8827.47] - - [128, 6784, 1, 3328] - - [509, 6537.09] + - [615, 6537.09] - - [5888, 1408, 1, 256] - - [523, 6129.71] + - [629, 6129.71] - - [5888, 1856, 1, 3328] - - [509, 7969.27] + - [615, 7969.27] - - [5056, 704, 1, 256] - - [509, 6723.92] + - [615, 6723.92] - - [2048, 400, 1, 512] - - [515, 4531.54] + - [621, 4531.54] - - [5888, 2944, 1, 3328] - - [515, 8608.14] + - [621, 8608.14] - - [1856, 4288, 1, 256] - - [515, 6297.64] + - [621, 6297.64] - - [1024, 5056, 1, 128] - - [493, 3595.47] + - [599, 3595.47] - - [5056, 5056, 1, 3328] - - [509, 8559.26] + - [615, 8559.26] - - [1408, 5888, 1, 1280] - - [504, 6797.16] + - [610, 6797.16] - - [2368, 448, 1, 128] - - [493, 2815.0] + - [599, 2815.0] - - [2368, 6784, 1, 128] - - [497, 4782.08] + - [603, 4782.08] - - [1024, 3584, 1, 3328] - - [505, 8402.54] + - [611, 8402.54] - - [512, 48000, 1, 2048] - - [509, 8162.33] + - [615, 8162.33] - - [128, 448, 1, 1280] - - [480, 2903.59] + - [586, 2903.59] - - [256, 4288, 1, 3328] - - [510, 6346.04] + - [616, 6346.04] - - [5888, 1408, 1, 1280] - - [509, 8959.55] + - [615, 8959.55] - - [704, 1856, 1, 3328] - - [504, 6955.37] + - [610, 6955.37] - - [4, 1408, 1, 128] - - [524, 60.1747] + - [630, 60.1747] - - [1024, 2368, 1, 256] - - [511, 5927.88] + - [617, 5927.88] - - [64, 4, 1, 256] - - [529, 13.3129] + - [635, 13.3129] - - [1408, 1856, 1, 1280] - - [507, 8051.68] + - [613, 8051.68] - - [1408, 64, 1, 1280] - - [483, 3400.55] + - [589, 3400.55] - - [448, 1024, 1, 1280] - - [511, 5730.02] + - [617, 5730.02] - - [6144, 24000, 1, 2048] - - [515, 7738.4] + - [621, 7738.4] - - [4096, 32, 1, 4096] - - [453, 2381.53] + - [559, 2381.53] - - [256, 1408, 1, 3328] - - [511, 4844.88] + - [617, 4844.88] - - [5056, 5056, 1, 1280] - - [515, 9090.2] + - [621, 9090.2] - - [448, 5056, 1, 256] - - [521, 4961.28] + - [627, 4961.28] - - [704, 1856, 1, 1280] - - [507, 6456.54] + - [613, 6456.54] - - [128, 5056, 1, 128] - - [436, 2251.12] + - [542, 2251.12] - - [2368, 128, 1, 256] - - [504, 3403.37] + - [610, 3403.37] - - [1760, 6400, 1, 1760] - - [503, 8959.8] + - [609, 8959.8] - - [1856, 1408, 1, 128] - - [496, 3493.16] + - [602, 3493.16] - - [64, 5056, 1, 256] - - [505, 2582.32] + - [611, 2582.32] - - [6784, 256, 1, 3328] - - [503, 7323.64] + - [609, 7323.64] - - [6784, 4288, 1, 3328] - - [505, 8542.19] + - [611, 8542.19] - - [4288, 448, 1, 256] - - [521, 5030.6] + - [627, 5030.6] - - [64, 704, 1, 128] - - [438, 375.567] + - [544, 375.567] - - [1856, 2368, 1, 3328] - - [514, 6742.44] + - [620, 6742.44] - - [4288, 2944, 1, 1280] - - [515, 8578.27] + - [621, 8578.27] - - [704, 5056, 1, 1280] - - [511, 8014.55] + - [617, 8014.55] - - [2368, 704, 1, 3328] - - [510, 6544.41] + - [616, 6544.41] - - [256, 5888, 1, 256] - - [508, 5933.0] + - [614, 5933.0] - - [1856, 4288, 1, 3328] - - [514, 7410.82] + - [620, 7410.82] - - [256, 2944, 1, 256] - - [510, 5014.08] + - [616, 5014.08] - - [5888, 1024, 1, 256] - - [515, 8069.44] + - [621, 8069.44] - - [448, 64, 1, 1280] - - [490, 2057.28] + - [596, 2057.28] - - [3072, 64, 1, 1024] - - [470, 2145.52] + - [576, 2145.52] - - [3584, 4, 1, 1280] - - [462, 498.743] + - [568, 498.743] - - [16384, 3200, 1, 4096] - - [502, 6621.53] + - [608, 6621.53] - - [2944, 64, 1, 256] - - [510, 2554.89] + - [616, 2554.89] - - [128, 4, 1, 1280] - - [472, 87.2489] + - [578, 87.2489] - - [1408, 2944, 1, 256] - - [509, 8029.45] + - [615, 8029.45] - - [256, 1856, 1, 1280] - - [504, 6170.7] + - [610, 6170.7] - - [6784, 5056, 1, 3328] - - [513, 7134.29] + - [619, 7134.29] - - [5056, 5056, 1, 256] - - [521, 6246.9] + - [627, 6246.9] - - [1408, 6784, 1, 128] - - [498, 4329.55] + - [604, 4329.55] - - [64, 1024, 1, 1280] - - [480, 3206.75] + - [586, 3206.75] - - [2944, 4, 1, 256] - - [529, 333.58] + - [635, 333.58] - - [704, 5056, 1, 128] - - [493, 4085.52] + - [599, 4085.52] - - [4, 2368, 1, 1280] - - [530, 394.767] + - [636, 394.767] - - [2368, 2944, 1, 1280] - - [509, 8634.05] + - [615, 8634.05] - - [128, 3584, 1, 1280] - - [510, 6046.25] + - [616, 6046.25] - - [6784, 6784, 1, 1280] - - [515, 8847.51] + - [621, 8847.51] - - [1408, 4288, 1, 1280] - - [515, 8236.79] + - [621, 8236.79] - - [3584, 4288, 1, 1280] - - [510, 7399.98] + - [616, 7399.98] - - [2368, 704, 1, 1280] - - [503, 6754.5] + - [609, 6754.5] - - [5056, 4288, 1, 3328] - - [509, 8569.63] + - [615, 8569.63] - - [3584, 2368, 1, 3328] - - [514, 7942.48] + - [620, 7942.48] - - [64, 704, 1, 1280] - - [483, 2363.69] + - [589, 2363.69] - - [4288, 256, 1, 256] - - [511, 4591.9] + - [617, 4591.9] - - [2944, 128, 1, 128] - - [436, 1878.39] + - [542, 1878.39] - - [6144, 32, 1, 2560] - - [481, 3334.2] + - [587, 3334.2] - - [6784, 448, 1, 1280] - - [513, 7939.3] + - [619, 7939.3] - - [1408, 2944, 1, 128] - - [497, 4096.61] + - [603, 4096.61] - - [4288, 2944, 1, 256] - - [503, 8141.23] + - [609, 8141.23] - - [5888, 704, 1, 1280] - - [504, 7516.23] + - [610, 7516.23] - - [5056, 4, 1, 3328] - - [447, 552.509] + - [553, 552.509] - - [1856, 64, 1, 1280] - - [453, 3870.86] + - [559, 3870.86] - - [1760, 16, 1, 1760] - - [465, 2181.51] + - [571, 2181.51] - - [448, 5888, 1, 128] - - [498, 3371.1] + - [604, 3371.1] - - [5888, 64, 1, 3328] - - [478, 5319.48] + - [584, 5319.48] - - [2944, 256, 1, 3328] - - [510, 7122.4] + - [616, 7122.4] - - [1024, 64, 1, 128] - - [425, 595.882] + - [531, 595.882] - - [5056, 2368, 1, 1280] - - [504, 7778.29] + - [610, 7778.29] - - [448, 3584, 1, 1280] - - [509, 6500.62] + - [615, 6500.62] - - [6784, 5888, 1, 256] - - [509, 8918.68] + - [615, 8918.68] - - [704, 1024, 1, 128] - - [493, 2627.51] + - [599, 2627.51] - - [704, 128, 1, 1280] - - [480, 3408.59] + - [586, 3408.59] - - [4, 3584, 1, 128] - - [524, 140.821] + - [630, 140.821] - - [1408, 448, 1, 1280] - - [504, 5881.54] + - [610, 5881.54] - - [1024, 1408, 1, 256] - - [508, 5647.27] + - [614, 5647.27] - - [2368, 2368, 1, 3328] - - [502, 7688.83] + - [608, 7688.83] - - [1856, 6784, 1, 128] - - [493, 4705.95] + - [599, 4705.95] - - [5056, 704, 1, 3328] - - [513, 8198.98] + - [619, 8198.98] - - [1408, 1856, 1, 256] - - [515, 6340.05] + - [621, 6340.05] - - [1408, 704, 1, 3328] - - [507, 7599.65] + - [613, 7599.65] - - [2368, 5056, 1, 256] - - [515, 8242.85] + - [621, 8242.85] - - [1408, 256, 1, 1280] - - [510, 4879.26] + - [616, 4879.26] - - [3072, 128, 1, 1024] - - [479, 2525.52] + - [585, 2525.52] - - [3584, 2368, 1, 1280] - - [511, 8132.72] + - [617, 8132.72] - - [4288, 64, 1, 3328] - - [466, 5156.53] + - [572, 5156.53] - - [2368, 4, 1, 1280] - - [528, 482.75] + - [634, 482.75] - - [704, 5888, 1, 256] - - [518, 5398.75] + - [624, 5398.75] - - [6784, 2944, 1, 128] - - [494, 4748.99] + - [600, 4748.99] - - [2560, 1600, 1, 2560] - - [505, 7355.0] + - [611, 7355.0] - - [4288, 6784, 1, 3328] - - [502, 7409.41] + - [608, 7409.41] - - [2944, 256, 1, 256] - - [510, 5077.42] + - [616, 5077.42] - - [2944, 6784, 1, 3328] - - [515, 8068.05] + - [621, 8068.05] - - [704, 1408, 1, 3328] - - [510, 7239.43] + - [616, 7239.43] - - [6144, 5984, 1, 2048] - - [509, 7176.07] + - [615, 7176.07] - - [3584, 704, 1, 3328] - - [515, 6642.86] + - [621, 6642.86] - - [2944, 256, 1, 128] - - [494, 2644.54] + - [600, 2644.54] - - [6784, 4, 1, 1280] - - [526, 402.487] + - [632, 402.487] - - [1024, 64, 1, 1280] - - [480, 2602.03] + - [586, 2602.03] - - [2048, 1600, 1, 512] - - [507, 5592.5] + - [613, 5592.5] - - [448, 4288, 1, 256] - - [505, 6128.99] + - [611, 6128.99] - - [64, 3584, 1, 3328] - - [446, 5534.93] + - [552, 5534.93] - - [1856, 4288, 1, 128] - - [496, 4400.11] + - [602, 4400.11] - - [704, 2368, 1, 1280] - - [521, 5735.02] + - [627, 5735.02] - - [1856, 2368, 1, 1280] - - [518, 6482.4] + - [624, 6482.4] - - [2368, 128, 1, 3328] - - [491, 4717.32] + - [597, 4717.32] - - [2944, 128, 1, 256] - - [518, 3276.9] + - [624, 3276.9] - - [448, 1408, 1, 256] - - [510, 4852.28] + - [616, 4852.28] - - [1856, 4288, 1, 1280] - - [505, 8132.96] + - [611, 8132.96] - - [64, 5056, 1, 3328] - - [481, 5097.06] + - [587, 5097.06] - - [4, 704, 1, 256] - - [528, 128.831] + - [634, 128.831] - - [1024, 448, 1, 128] - - [493, 1816.94] + - [599, 1816.94] - - [704, 4, 1, 1280] - - [529, 328.976] + - [635, 328.976] - - [704, 256, 1, 128] - - [497, 876.569] + - [603, 876.569] - - [704, 2944, 1, 128] - - [497, 3734.47] + - [603, 3734.47] - - [1408, 1024, 1, 1280] - - [505, 7224.85] + - [611, 7224.85] - - [704, 6784, 1, 256] - - [509, 7354.77] + - [615, 7354.77] - - [6784, 704, 1, 256] - - [505, 6012.28] + - [611, 6012.28] - - [5056, 1408, 1, 128] - - [498, 4311.28] + - [604, 4311.28] - - [2048, 7000, 1, 2048] - - [509, 7232.07] + - [615, 7232.07] - - [256, 3584, 1, 3328] - - [513, 7006.0] + - [619, 7006.0] - - [4, 5888, 1, 3328] - - [531, 534.612] + - [637, 534.612] - - [128, 1408, 1, 128] - - [423, 1177.07] + - [529, 1177.07] - - [3584, 4288, 1, 3328] - - [515, 7135.0] + - [621, 7135.0] - - [5888, 1856, 1, 1280] - - [503, 8395.03] + - [609, 8395.03] - - [256, 1408, 1, 256] - - [504, 3977.46] + - [610, 3977.46] - - [5056, 64, 1, 1280] - - [504, 4257.78] + - [610, 4257.78] - - [1024, 704, 1, 256] - - [504, 5036.93] + - [610, 5036.93] - - [448, 128, 1, 128] - - [425, 533.533] + - [531, 533.533] - - [2368, 3584, 1, 1280] - - [509, 8272.43] + - [615, 8272.43] - - [2368, 6784, 1, 1280] - - [502, 8288.24] + - [608, 8288.24] - - [1856, 4, 1, 1280] - - [442, 464.1] + - [548, 464.1] - - [448, 448, 1, 256] - - [504, 3058.45] + - [610, 3058.45] - - [2944, 3584, 1, 3328] - - [509, 8557.63] + - [615, 8557.63] - - [7680, 32, 1, 2560] - - [481, 3729.03] + - [587, 3729.03] - - [128, 4288, 1, 128] - - [424, 2116.2] + - [530, 2116.2] - - [256, 256, 1, 3328] - - [480, 4051.06] + - [586, 4051.06] - - [128, 1024, 1, 3328] - - [453, 5139.21] + - [559, 5139.21] - - [4, 1408, 1, 3328] - - [472, 502.871] + - [578, 502.871] - - [6784, 2944, 1, 256] - - [503, 8446.06] + - [609, 8446.06] - - [64, 1856, 1, 1280] - - [445, 3870.86] + - [551, 3870.86] - - [6784, 64, 1, 128] - - [493, 1877.62] + - [599, 1877.62] - - [4288, 2368, 1, 3328] - - [513, 8419.4] + - [619, 8419.4] - - [1856, 2368, 1, 256] - - [507, 6887.48] + - [613, 6887.48] - - [3584, 256, 1, 128] - - [497, 2496.71] + - [603, 2496.71] - - [3584, 6784, 1, 3328] - - [509, 7626.18] + - [615, 7626.18] - - [256, 1024, 1, 256] - - [510, 3095.53] + - [616, 3095.53] - - [4, 6784, 1, 3328] - - [472, 589.274] + - [578, 589.274] - - [1024, 5888, 1, 3328] - - [509, 7794.35] + - [615, 7794.35] - - [1024, 128, 1, 1280] - - [482, 3130.18] + - [588, 3130.18] - - [3072, 32, 1, 1024] - - [469, 1675.59] + - [575, 1675.59] - - [6144, 24000, 1, 2560] - - [509, 7256.14] + - [615, 7256.14] - - [5056, 4288, 1, 1280] - - [507, 8349.03] + - [613, 8349.03] - - [5888, 64, 1, 256] - - [456, 2593.35] + - [562, 2593.35] - - [6784, 1856, 1, 3328] - - [503, 8087.38] + - [609, 8087.38] - - [1408, 5056, 1, 1280] - - [505, 7802.63] + - [611, 7802.63] - - [1856, 256, 1, 1280] - - [510, 6150.73] + - [616, 6150.73] - - [64, 5888, 1, 3328] - - [477, 5301.49] + - [583, 5301.49] - - [2368, 2368, 1, 1280] - - [507, 8233.43] + - [613, 8233.43] - - [2944, 5888, 1, 128] - - [500, 3745.51] + - [606, 3745.51] - - [704, 5888, 1, 1280] - - [505, 8245.04] + - [611, 8245.04] - - [2368, 3584, 1, 128] - - [497, 4523.43] + - [603, 4523.43] - - [1856, 5056, 1, 128] - - [494, 4498.08] + - [600, 4498.08] - - [704, 1024, 1, 1280] - - [518, 5479.59] + - [624, 5479.59] - - [448, 256, 1, 3328] - - [461, 5048.8] + - [567, 5048.8] - - [448, 1856, 1, 128] - - [494, 2936.92] + - [600, 2936.92] - - [8192, 3200, 1, 2048] - - [503, 6713.12] + - [609, 6713.12] - - [128, 1024, 1, 128] - - [439, 998.744] + - [545, 998.744] - - [2944, 4, 1, 128] - - [524, 98.7471] + - [630, 98.7471] - - [1024, 704, 1, 1280] - - [510, 5897.0] + - [616, 5897.0] - - [128, 5888, 1, 256] - - [510, 5014.08] + - [616, 5014.08] - - [1024, 5056, 1, 1280] - - [509, 8857.81] + - [615, 8857.81] - - [4288, 1024, 1, 256] - - [515, 6195.39] + - [621, 6195.39] - - [2944, 2368, 1, 128] - - [493, 4442.23] + - [599, 4442.23] - - [704, 704, 1, 3328] - - [510, 6764.4] + - [616, 6764.4] - - [704, 1408, 1, 1280] - - [511, 7383.58] + - [617, 7383.58] - - [5888, 448, 1, 1280] - - [509, 7299.49] + - [615, 7299.49] - - [3584, 256, 1, 3328] - - [507, 7061.72] + - [613, 7061.72] - - [704, 5888, 1, 3328] - - [511, 8142.42] + - [617, 8142.42] - - [704, 1856, 1, 128] - - [497, 3139.14] + - [603, 3139.14] - - [448, 448, 1, 3328] - - [475, 5063.34] + - [581, 5063.34] - - [4, 4288, 1, 128] - - [525, 64.9775] + - [631, 64.9775] - - [128, 704, 1, 1280] - - [445, 3400.55] + - [551, 3400.55] - - [3584, 2944, 1, 256] - - [515, 7982.14] + - [621, 7982.14] - - [3584, 4, 1, 128] - - [524, 105.318] + - [630, 105.318] - - [1856, 128, 1, 3328] - - [476, 5442.19] + - [582, 5442.19] - - [4, 64, 1, 1280] - - [530, 42.3268] + - [636, 42.3268] - - [2944, 448, 1, 128] - - [493, 2926.95] + - [599, 2926.95] - - [128, 2944, 1, 1280] - - [504, 5109.69] + - [610, 5109.69] - - [64, 64, 1, 3328] - - [472, 1252.99] + - [578, 1252.99] - - [448, 2944, 1, 1280] - - [513, 6684.47] + - [619, 6684.47] - - [512, 24000, 1, 2048] - - [509, 7939.03] + - [615, 7939.03] - - [128, 256, 1, 3328] - - [490, 3276.9] + - [596, 3276.9] - - [1408, 5056, 1, 3328] - - [515, 8959.21] + - [621, 8959.21] - - [1856, 1856, 1, 3328] - - [505, 8006.17] + - [611, 8006.17] - - [3584, 128, 1, 256] - - [510, 4292.52] + - [616, 4292.52] - - [2560, 800, 1, 2560] - - [505, 6262.48] + - [611, 6262.48] - - [448, 1408, 1, 3328] - - [521, 4997.35] + - [627, 4997.35] - - [2368, 2368, 1, 256] - - [523, 4978.94] + - [629, 4978.94] - - [4288, 4288, 1, 1280] - - [502, 8617.78] + - [608, 8617.78] - - [64, 448, 1, 1280] - - [448, 2057.28] + - [554, 2057.28] - - [5888, 1024, 1, 1280] - - [520, 6848.17] + - [626, 6848.17] - - [1408, 4288, 1, 256] - - [503, 7077.01] + - [609, 7077.01] - - [448, 4, 1, 256] - - [528, 84.4294] + - [634, 84.4294] - - [5888, 448, 1, 128] - - [497, 3493.91] + - [603, 3493.91] - - [512, 48000, 1, 2560] - - [515, 8960.13] + - [621, 8960.13] - - [35, 8457, 1, 1760] - - [417, 3934.78] + - [523, 3934.78] - - [704, 6784, 1, 3328] - - [502, 8180.88] + - [608, 8180.88] - - [2560, 6400, 1, 2560] - - [503, 7822.24] + - [609, 7822.24] - - [5056, 1024, 1, 1280] - - [505, 8357.38] + - [611, 8357.38] - - [448, 5888, 1, 3328] - - [509, 7505.28] + - [615, 7505.28] - - [128, 4, 1, 128] - - [524, 0.662251] + - [630, 0.662251] - - [1024, 2944, 1, 1280] - - [509, 8406.24] + - [615, 8406.24] - - [5056, 5888, 1, 1280] - - [509, 8819.76] + - [615, 8819.76] - - [4288, 5888, 1, 128] - - [494, 3522.32] + - [600, 3522.32] - - [256, 3584, 1, 256] - - [505, 5883.89] + - [611, 5883.89] - - [1408, 3584, 1, 128] - - [493, 4283.41] + - [599, 4283.41] - - [256, 2944, 1, 3328] - - [513, 5670.63] + - [619, 5670.63] - - [448, 3584, 1, 128] - - [497, 3171.72] + - [603, 3171.72] - - [5888, 2944, 1, 1280] - - [515, 8198.86] + - [621, 8198.86] - - [4, 6784, 1, 1280] - - [462, 553.896] + - [568, 553.896] - - [2368, 5888, 1, 128] - - [493, 4787.32] + - [599, 4787.32] - - [8448, 16, 1, 2816] - - [452, 2452.63] + - [558, 2452.63] - - [64, 2944, 1, 128] - - [425, 1376.66] + - [531, 1376.66] - - [2368, 4, 1, 256] - - [447, 278.177] + - [553, 278.177] - - [3584, 5888, 1, 256] - - [523, 6233.66] + - [629, 6233.66] - - [2368, 1024, 1, 128] - - [494, 3781.51] + - [600, 3781.51] - - [2368, 704, 1, 128] - - [494, 3198.32] + - [600, 3198.32] - - [3584, 2944, 1, 1280] - - [505, 8045.68] + - [611, 8045.68] - - [3584, 2368, 1, 128] - - [494, 4188.57] + - [600, 4188.57] - - [5056, 704, 1, 128] - - [497, 4019.21] + - [603, 4019.21] - - [448, 2368, 1, 128] - - [499, 2522.21] + - [605, 2522.21] - - [5056, 1408, 1, 3328] - - [507, 8349.93] + - [613, 8349.93] - - [1408, 704, 1, 256] - - [513, 4741.42] + - [619, 4741.42] - - [6784, 1024, 1, 3328] - - [515, 8769.5] + - [621, 8769.5] - - [6784, 2944, 1, 3328] - - [512, 7319.74] + - [618, 7319.74] - - [2944, 5056, 1, 3328] - - [502, 8889.76] + - [608, 8889.76] - - [1856, 1856, 1, 256] - - [505, 6309.84] + - [611, 6309.84] - - [1024, 5888, 1, 128] - - [496, 3759.6] + - [602, 3759.6] - - [6784, 2368, 1, 1280] - - [505, 8298.4] + - [611, 8298.4] - - [256, 4, 1, 128] - - [524, 7.10171] + - [630, 7.10171] - - [4288, 5888, 1, 1280] - - [509, 8365.28] + - [615, 8365.28] - - [4288, 4288, 1, 256] - - [509, 6513.78] + - [615, 6513.78] - - [8448, 32, 1, 2816] - - [480, 4257.74] + - [586, 4257.74] - - [448, 2944, 1, 3328] - - [513, 6875.62] + - [619, 6875.62] - - [5888, 4, 1, 128] - - [524, 163.94] + - [630, 163.94] - - [4288, 1856, 1, 1280] - - [509, 8402.91] + - [615, 8402.91] - - [1856, 2944, 1, 3328] - - [509, 6612.21] + - [615, 6612.21] - - [256, 6784, 1, 3328] - - [510, 7358.7] + - [616, 7358.7] - - [64, 5888, 1, 256] - - [504, 3359.05] + - [610, 3359.05] - - [256, 5056, 1, 128] - - [497, 2489.21] + - [603, 2489.21] - - [5056, 1024, 1, 256] - - [515, 8077.87] + - [621, 8077.87] - - [704, 64, 1, 3328] - - [459, 3288.4] + - [565, 3288.4] - - [5056, 1856, 1, 3328] - - [513, 8171.13] + - [619, 8171.13] - - [4, 2944, 1, 3328] - - [472, 546.843] + - [578, 546.843] - - [4, 5056, 1, 256] - - [447, 378.561] + - [553, 378.561] - - [1856, 1408, 1, 256] - - [515, 6320.88] + - [621, 6320.88] - - [8448, 12000, 1, 2816] - - [513, 7365.87] + - [619, 7365.87] - - [6784, 128, 1, 3328] - - [510, 6366.57] + - [616, 6366.57] - - [4288, 1408, 1, 128] - - [493, 4451.7] + - [599, 4451.7] - - [1856, 5888, 1, 3328] - - [511, 8619.76] + - [617, 8619.76] - - [4288, 5056, 1, 256] - - [515, 7289.05] + - [621, 7289.05] - - [1408, 128, 1, 1280] - - [453, 4291.15] + - [559, 4291.15] - - [4096, 800, 1, 1024] - - [504, 5867.89] + - [610, 5867.89] - - [5056, 256, 1, 3328] - - [510, 7527.61] + - [616, 7527.61] - - [704, 704, 1, 256] - - [510, 4417.85] + - [616, 4417.85] - - [1024, 5888, 1, 1280] - - [515, 8674.57] + - [621, 8674.57] - - [6784, 2368, 1, 128] - - [493, 4724.08] + - [599, 4724.08] - - [4, 5056, 1, 1280] - - [462, 540.307] + - [568, 540.307] - - [256, 64, 1, 1280] - - [464, 1515.38] + - [570, 1515.38] - - [128, 1856, 1, 1280] - - [504, 4574.21] + - [610, 4574.21] - - [1856, 1024, 1, 1280] - - [509, 7741.61] + - [615, 7741.61] - - [6784, 4288, 1, 1280] - - [515, 8521.29] + - [621, 8521.29] - - [2560, 64, 1, 2560] - - [446, 3504.7] + - [552, 3504.7] - - [1856, 1856, 1, 1280] - - [505, 7779.31] + - [611, 7779.31] - - [4096, 400, 1, 1024] - - [515, 4157.81] + - [621, 4157.81] - - [3072, 24000, 1, 1024] - - [515, 8663.45] + - [621, 8663.45] - - [128, 4288, 1, 3328] - - [461, 5674.23] + - [567, 5674.23] - - [4, 2368, 1, 3328] - - [472, 525.48] + - [578, 525.48] - - [5888, 1856, 1, 128] - - [497, 4099.74] + - [603, 4099.74] - - [448, 704, 1, 1280] - - [510, 4309.47] + - [616, 4309.47] - - [128, 5056, 1, 1280] - - [453, 5068.46] + - [559, 5068.46] - - [1024, 448, 1, 3328] - - [513, 6077.82] + - [619, 6077.82] - - [1856, 704, 1, 1280] - - [521, 6257.49] + - [627, 6257.49] - - [5056, 3584, 1, 128] - - [494, 4598.52] + - [600, 4598.52] - - [5888, 5888, 1, 3328] - - [515, 8058.25] + - [621, 8058.25] - - [6784, 1024, 1, 256] - - [515, 5120.99] + - [621, 5120.99] - - [2944, 2368, 1, 256] - - [506, 6523.03] + - [612, 6523.03] - - [256, 448, 1, 256] - - [456, 1816.94] + - [562, 1816.94] - - [5056, 5888, 1, 3328] - - [508, 6722.41] + - [614, 6722.41] - - [1856, 1024, 1, 256] - - [515, 6632.31] + - [621, 6632.31] - - [512, 48000, 1, 1536] - - [509, 8556.01] + - [615, 8556.01] - - [3584, 448, 1, 1280] - - [504, 6567.09] + - [610, 6567.09] - - [8448, 5984, 1, 2816] - - [509, 8990.66] + - [615, 8990.66] - - [448, 5888, 1, 256] - - [509, 6220.47] + - [615, 6220.47] - - [704, 64, 1, 128] - - [422, 450.66] + - [528, 450.66] - - [1408, 6784, 1, 3328] - - [502, 8478.68] + - [608, 8478.68] - - [448, 1024, 1, 128] - - [501, 1844.33] + - [607, 1844.33] - - [4288, 704, 1, 128] - - [497, 3895.26] + - [603, 3895.26] - - [128, 1856, 1, 128] - - [428, 1456.46] + - [534, 1456.46] - - [448, 2368, 1, 3328] - - [507, 5538.04] + - [613, 5538.04] - - [5056, 64, 1, 128] - - [493, 1648.94] + - [599, 1648.94] - - [5056, 2944, 1, 256] - - [509, 8230.87] + - [615, 8230.87] - - [6784, 5888, 1, 128] - - [493, 4873.19] + - [599, 4873.19] - - [1024, 700, 1, 512] - - [507, 4445.37] + - [613, 4445.37] - - [704, 1024, 1, 256] - - [505, 4707.99] + - [611, 4707.99] - - [1024, 4, 1, 256] - - [447, 174.863] + - [553, 174.863] - - [2944, 704, 1, 128] - - [497, 3483.42] + - [603, 3483.42] - - [128, 6784, 1, 1280] - - [505, 6522.93] + - [611, 6522.93] - - [1408, 3584, 1, 3328] - - [509, 8673.59] + - [615, 8673.59] - - [2368, 6784, 1, 256] - - [505, 7941.76] + - [611, 7941.76] - - [5056, 1408, 1, 1280] - - [509, 8801.01] + - [615, 8801.01] - - [256, 256, 1, 128] - - [434, 551.982] + - [540, 551.982] - - [5056, 4288, 1, 128] - - [501, 3793.64] + - [607, 3793.64] - - [1408, 1856, 1, 128] - - [493, 3067.74] + - [599, 3067.74] - - [1408, 5888, 1, 3328] - - [509, 9148.97] + - [615, 9148.97] - - [1856, 256, 1, 256] - - [505, 4319.52] + - [611, 4319.52] - - [6784, 6784, 1, 256] - - [505, 7668.53] + - [611, 7668.53] - - [64, 256, 1, 128] - - [439, 131.172] + - [545, 131.172] - - [4288, 2368, 1, 128] - - [494, 4582.99] + - [600, 4582.99] - - [256, 4288, 1, 1280] - - [504, 6058.61] + - [610, 6058.61] - - [2368, 2944, 1, 256] - - [509, 8016.07] + - [615, 8016.07] - - [4, 1856, 1, 256] - - [526, 252.832] + - [632, 252.832] - - [3584, 1856, 1, 1280] - - [505, 7760.24] + - [611, 7760.24] - - [6784, 6784, 1, 128] - - [494, 4970.14] + - [600, 4970.14] - - [256, 1856, 1, 128] - - [500, 1580.59] + - [606, 1580.59] - - [704, 64, 1, 1280] - - [489, 2556.47] + - [595, 2556.47] - - [5888, 5056, 1, 256] - - [509, 8216.67] + - [615, 8216.67] - - [8448, 48000, 1, 2816] - - [515, 4082.89] + - [621, 4082.89] - - [3584, 448, 1, 256] - - [509, 5518.92] + - [615, 5518.92] - - [448, 4288, 1, 128] - - [497, 3415.25] + - [603, 3415.25] - - [7680, 64, 1, 2560] - - [458, 5162.1] + - [564, 5162.1] - - [256, 6784, 1, 256] - - [509, 6272.62] + - [615, 6272.62] - - [1408, 4288, 1, 128] - - [497, 4343.63] + - [603, 4343.63] - - [2944, 704, 1, 3328] - - [504, 7679.71] + - [610, 7679.71] - - [128, 448, 1, 256] - - [444, 1422.59] + - [550, 1422.59] - - [5056, 256, 1, 1280] - - [511, 5052.39] + - [617, 5052.39] - - [2560, 32, 1, 2560] - - [467, 3106.07] + - [573, 3106.07] - - [3584, 3584, 1, 256] - - [515, 8260.57] + - [621, 8260.57] - - [448, 1408, 1, 128] - - [493, 2397.38] + - [599, 2397.38] - - [128, 256, 1, 1280] - - [448, 2340.67] + - [554, 2340.67] - - [3584, 5056, 1, 256] - - [515, 7347.56] + - [621, 7347.56] - - [6784, 128, 1, 256] - - [505, 5591.1] + - [611, 5591.1] - - [4288, 4, 1, 256] - - [447, 354.206] + - [553, 354.206] - - [704, 448, 1, 256] - - [510, 3492.33] + - [616, 3492.33] - - [2944, 2368, 1, 1280] - - [517, 6661.71] + - [623, 6661.71] - - [448, 64, 1, 3328] - - [489, 3058.45] + - [595, 3058.45] - - [1408, 3584, 1, 256] - - [515, 7966.59] + - [621, 7966.59] - - [3584, 4, 1, 3328] - - [528, 605.559] + - [634, 605.559] - - [6784, 3584, 1, 256] - - [505, 7525.41] + - [611, 7525.41] - - [256, 128, 1, 128] - - [437, 276.041] + - [543, 276.041] - - [704, 1408, 1, 128] - - [494, 3109.85] + - [600, 3109.85] - - [4, 2368, 1, 256] - - [528, 283.375] + - [634, 283.375] - - [4288, 128, 1, 1280] - - [510, 5132.65] + - [616, 5132.65] - - [128, 1408, 1, 256] - - [504, 2733.35] + - [610, 2733.35] - - [4, 2944, 1, 256] - - [526, 314.127] + - [632, 314.127] - - [64, 128, 1, 3328] - - [474, 1514.71] + - [580, 1514.71] - - [5056, 2368, 1, 128] - - [498, 3449.17] + - [604, 3449.17] - - [2944, 2944, 1, 3328] - - [502, 8169.03] + - [608, 8169.03] - - [5056, 6784, 1, 256] - - [522, 5792.77] + - [628, 5792.77] - - [1856, 3584, 1, 128] - - [499, 4213.5] + - [605, 4213.5] - - [128, 2944, 1, 128] - - [423, 1970.46] + - [529, 1970.46] - - [35, 8457, 1, 2560] - - [418, 3525.15] + - [524, 3525.15] - - [1024, 704, 1, 3328] - - [504, 6784.99] + - [610, 6784.99] - - [6784, 448, 1, 256] - - [513, 6544.88] + - [619, 6544.88] - - [3584, 6784, 1, 128] - - [493, 4623.6] + - [599, 4623.6] - - [128, 4288, 1, 256] - - [507, 3606.6] + - [613, 3606.6] - - [704, 448, 1, 3328] - - [504, 4478.01] + - [610, 4478.01] - - [128, 128, 1, 3328] - - [489, 2177.65] + - [595, 2177.65] - - [5056, 1856, 1, 256] - - [523, 5608.72] + - [629, 5608.72] - - [4608, 5984, 1, 1536] - - [512, 7859.85] + - [618, 7859.85] - - [256, 128, 1, 256] - - [448, 998.744] + - [554, 998.744] - - [1760, 3200, 1, 1760] - - [505, 8179.64] + - [611, 8179.64] - - [1024, 1856, 1, 256] - - [515, 6143.27] + - [621, 6143.27] - - [4096, 1600, 1, 1024] - - [523, 5851.52] + - [629, 5851.52] - - [4288, 64, 1, 128] - - [428, 1372.26] + - [534, 1372.26] - - [256, 448, 1, 3328] - - [467, 4795.1] + - [573, 4795.1] - - [1408, 6784, 1, 1280] - - [509, 8426.5] + - [615, 8426.5] - - [3584, 3584, 1, 1280] - - [509, 7556.56] + - [615, 7556.56] - - [7680, 24000, 1, 2560] - - [502, 5019.19] + - [608, 5019.19] - - [64, 2368, 1, 1280] - - [453, 4061.8] + - [559, 4061.8] - - [448, 2368, 1, 1280] - - [504, 5928.77] + - [610, 5928.77] - - [4608, 48000, 1, 1536] - - [509, 6937.4] + - [615, 6937.4] - - [5888, 5888, 1, 128] - - [494, 3744.0] + - [600, 3744.0] - - [64, 6784, 1, 3328] - - [504, 5988.72] + - [610, 5988.72] - - [2944, 256, 1, 1280] - - [510, 6717.97] + - [616, 6717.97] - - [2048, 16, 1, 2048] - - [462, 1210.58] + - [568, 1210.58] - - [256, 2368, 1, 128] - - [497, 1936.07] + - [603, 1936.07] - - [5056, 2368, 1, 3328] - - [515, 8875.63] + - [621, 8875.63] - - [2944, 4288, 1, 256] - - [509, 8063.24] + - [615, 8063.24] - - [1408, 3584, 1, 1280] - - [505, 8197.07] + - [611, 8197.07] - - [2368, 64, 1, 256] - - [504, 2365.79] + - [610, 2365.79] - - [64, 448, 1, 3328] - - [490, 3027.4] + - [596, 3027.4] - - [704, 128, 1, 3328] - - [461, 4452.19] + - [567, 4452.19] - - [8192, 1600, 1, 2048] - - [509, 7229.93] + - [615, 7229.93] - - [1856, 704, 1, 256] - - [511, 5545.45] + - [617, 5545.45] - - [4, 4288, 1, 1280] - - [462, 523.825] + - [568, 523.825] - - [1408, 448, 1, 3328] - - [516, 4789.4] + - [622, 4789.4] - - [1024, 4, 1, 3328] - - [442, 504.223] + - [548, 504.223] - - [512, 24000, 1, 2560] - - [515, 8903.62] + - [621, 8903.62] - - [2368, 6784, 1, 3328] - - [515, 8311.14] + - [621, 8311.14] - - [1856, 1408, 1, 1280] - - [505, 8160.11] + - [611, 8160.11] - - [1856, 448, 1, 1280] - - [507, 6243.07] + - [613, 6243.07] - - [6784, 704, 1, 128] - - [493, 4069.05] + - [599, 4069.05] - - [4, 4, 1, 256] - - [462, 0.842029] + - [568, 0.842029] - - [128, 5888, 1, 128] - - [493, 2328.02] + - [599, 2328.02] - - [1408, 5888, 1, 256] - - [504, 6986.91] + - [610, 6986.91] - - [704, 2944, 1, 1280] - - [505, 7905.03] + - [611, 7905.03] - - [4288, 64, 1, 1280] - - [480, 3828.27] + - [586, 3828.27] - - [256, 64, 1, 256] - - [455, 655.46] + - [561, 655.46] - - [704, 1856, 1, 256] - - [513, 5444.37] + - [619, 5444.37] - - [704, 6784, 1, 128] - - [493, 4319.77] + - [599, 4319.77] - - [3584, 704, 1, 1280] - - [513, 7726.43] + - [619, 7726.43] - - [256, 128, 1, 1280] - - [448, 2184.63] + - [554, 2184.63] - - [5888, 2368, 1, 256] - - [515, 8192.69] + - [621, 8192.69] - - [256, 2368, 1, 1280] - - [510, 5675.54] + - [616, 5675.54] - - [2944, 6784, 1, 128] - - [498, 4248.35] + - [604, 4248.35] - - [3584, 448, 1, 3328] - - [509, 6560.77] + - [615, 6560.77] - - [1408, 4, 1, 256] - - [527, 176.79] + - [633, 176.79] - - [704, 2368, 1, 3328] - - [510, 7085.31] + - [616, 7085.31] - - [2944, 448, 1, 256] - - [506, 3412.0] + - [612, 3412.0] - - [1856, 448, 1, 128] - - [494, 2748.82] + - [600, 2748.82] - - [4288, 4, 1, 3328] - - [462, 553.648] + - [568, 553.648] - - [2368, 128, 1, 1280] - - [483, 4173.65] + - [589, 4173.65] - - [256, 5888, 1, 128] - - [498, 2860.98] + - [604, 2860.98] - - [64, 6784, 1, 256] - - [511, 3637.18] + - [617, 3637.18] - - [64, 5056, 1, 1280] - - [510, 4289.53] + - [616, 4289.53] - - [4, 6784, 1, 128] - - [524, 160.906] + - [630, 160.906] - - [2048, 3200, 1, 512] - - [511, 6927.09] + - [617, 6927.09] - - [2944, 2944, 1, 1280] - - [503, 6267.85] + - [609, 6267.85] - - [5056, 448, 1, 3328] - - [504, 7400.36] + - [610, 7400.36] - - [4, 3584, 1, 1280] - - [462, 499.83] + - [568, 499.83] - - [1408, 128, 1, 128] - - [439, 1037.36] + - [545, 1037.36] - - [6784, 704, 1, 3328] - - [510, 7633.95] + - [616, 7633.95] - - [128, 64, 1, 1280] - - [462, 1170.39] + - [568, 1170.39] - - [2368, 256, 1, 1280] - - [510, 5609.89] + - [616, 5609.89] - - [4, 448, 1, 3328] - - [530, 358.5] + - [636, 358.5] - - [5888, 4288, 1, 128] - - [498, 4521.74] + - [604, 4521.74] - - [4, 5888, 1, 256] - - [462, 353.933] + - [568, 353.933] - - [1408, 2944, 1, 3328] - - [503, 8951.41] + - [609, 8951.41] - - [3584, 704, 1, 128] - - [493, 3395.41] + - [599, 3395.41] - - [4608, 12000, 1, 1536] - - [502, 6609.99] + - [608, 6609.99] - - [64, 1024, 1, 256] - - [448, 1588.85] + - [554, 1588.85] - - [5056, 5056, 1, 128] - - [493, 4080.81] + - [599, 4080.81] - - [2368, 448, 1, 1280] - - [504, 5423.04] + - [610, 5423.04] - - [128, 3584, 1, 256] - - [510, 4705.25] + - [616, 4705.25] - - [704, 448, 1, 1280] - - [507, 3961.07] + - [613, 3961.07] - - [8192, 800, 1, 2048] - - [505, 6306.36] + - [611, 6306.36] - - [448, 5056, 1, 128] - - [497, 3709.56] + - [603, 3709.56] - - [256, 4, 1, 1280] - - [529, 163.94] + - [635, 163.94] - - [5056, 3584, 1, 256] - - [502, 7008.34] + - [608, 7008.34] - - [2368, 4, 1, 3328] - - [462, 496.366] + - [568, 496.366] - - [1408, 5056, 1, 128] - - [497, 4175.37] + - [603, 4175.37] - - [2944, 3584, 1, 128] - - [493, 4659.79] + - [599, 4659.79] - - [3584, 2368, 1, 256] - - [515, 5851.87] + - [621, 5851.87] - - [128, 3584, 1, 3328] - - [505, 6105.04] + - [611, 6105.04] - - [128, 1024, 1, 1280] - - [445, 3848.09] + - [551, 3848.09] - - [8448, 24000, 1, 2816] - - [515, 5128.64] + - [621, 5128.64] - - [64, 704, 1, 256] - - [448, 1253.83] + - [554, 1253.83] - - [4288, 256, 1, 1280] - - [504, 5625.86] + - [610, 5625.86] - - [3584, 3584, 1, 3328] - - [509, 8206.15] + - [615, 8206.15] - - [4, 704, 1, 128] - - [524, 29.5484] + - [630, 29.5484] - - [5888, 6784, 1, 256] - - [511, 8248.75] + - [617, 8248.75] - - [4288, 2944, 1, 3328] - - [509, 8657.12] + - [615, 8657.12] - - [2944, 64, 1, 128] - - [428, 1240.7] + - [534, 1240.7] - - [1024, 128, 1, 3328] - - [453, 4433.1] + - [559, 4433.1] - - [1024, 16, 1, 500000] - - [416, 2571.15] + - [522, 2571.15] - - [4288, 128, 1, 3328] - - [453, 5716.85] + - [559, 5716.85] - - [7680, 128, 1, 2560] - - [451, 5488.1] + - [557, 5488.1] - - [256, 5056, 1, 1280] - - [511, 6380.06] + - [617, 6380.06] - - [1408, 256, 1, 128] - - [497, 1633.83] + - [603, 1633.83] - - [2944, 5888, 1, 3328] - - [506, 7849.02] + - [612, 7849.02] - - [6784, 5888, 1, 1280] - - [515, 9047.72] + - [621, 9047.72] - - [2048, 800, 1, 512] - - [510, 4841.17] + - [616, 4841.17] - - [704, 128, 1, 256] - - [455, 1567.27] + - [561, 1567.27] - - [5888, 4288, 1, 1280] - - [509, 7982.93] + - [615, 7982.93] - - [1024, 24000, 1, 2048] - - [511, 5774.4] + - [617, 5774.4] - - [448, 256, 1, 1280] - - [445, 3707.19] + - [551, 3707.19] - - [5888, 3584, 1, 128] - - [498, 3804.5] + - [604, 3804.5] - - [1024, 2944, 1, 128] - - [493, 3308.36] + - [599, 3308.36] - - [5056, 4, 1, 1280] - - [526, 469.062] + - [632, 469.062] - - [256, 1408, 1, 1280] - - [504, 4899.99] + - [610, 4899.99] - - [3072, 16, 1, 1024] - - [462, 1233.72] + - [568, 1233.72] - - [704, 3584, 1, 128] - - [493, 3919.53] + - [599, 3919.53] - - [5888, 448, 1, 3328] - - [523, 6095.71] + - [629, 6095.71] - - [2368, 4288, 1, 1280] - - [505, 8338.4] + - [611, 8338.4] - - [4288, 2944, 1, 128] - - [497, 3946.6] + - [603, 3946.6] - - [1024, 6784, 1, 3328] - - [511, 7494.38] + - [617, 7494.38] - - [128, 2368, 1, 256] - - [510, 2895.42] + - [616, 2895.42] - - [6784, 64, 1, 3328] - - [504, 5964.99] + - [610, 5964.99] - - [5056, 2944, 1, 3328] - - [515, 6605.63] + - [621, 6605.63] - - [448, 128, 1, 256] - - [448, 1339.52] + - [554, 1339.52] - - [2944, 3584, 1, 256] - - [511, 7165.66] + - [617, 7165.66] - - [1408, 1408, 1, 3328] - - [515, 8332.96] + - [621, 8332.96] - - [1856, 128, 1, 1280] - - [510, 4498.43] + - [616, 4498.43] - - [3584, 3584, 1, 128] - - [494, 4000.11] + - [600, 4000.11] - - [64, 3584, 1, 256] - - [521, 2383.23] + - [627, 2383.23] - - [1408, 4, 1, 3328] - - [472, 423.008] + - [578, 423.008] - - [128, 2944, 1, 3328] - - [477, 5430.03] + - [583, 5430.03] - - [3584, 704, 1, 256] - - [510, 6154.09] + - [616, 6154.09] - - [2944, 448, 1, 3328] - - [510, 6507.82] + - [616, 6507.82] - - [3584, 1408, 1, 3328] - - [515, 8829.73] + - [621, 8829.73] - - [704, 3584, 1, 1280] - - [505, 7860.33] + - [611, 7860.33] - - [2944, 6784, 1, 1280] - - [515, 8894.6] + - [621, 8894.6] - - [1856, 6784, 1, 256] - - [515, 8115.19] + - [621, 8115.19] - - [4288, 448, 1, 3328] - - [507, 6397.35] + - [613, 6397.35] - - [6784, 4288, 1, 128] - - [493, 4109.54] + - [599, 4109.54] - - [6784, 704, 1, 1280] - - [503, 7999.14] + - [609, 7999.14] - - [256, 4288, 1, 256] - - [507, 4603.94] + - [613, 4603.94] - - [3584, 6784, 1, 256] - - [515, 7361.65] + - [621, 7361.65] - - [6144, 12000, 1, 2048] - - [514, 6311.76] + - [620, 6311.76] - - [6144, 16, 1, 2560] - - [463, 2240.65] + - [569, 2240.65] - - [3584, 64, 1, 128] - - [434, 1292.36] + - [540, 1292.36] - - [5888, 1024, 1, 3328] - - [502, 8394.59] + - [608, 8394.59] - - [448, 64, 1, 128] - - [425, 262.244] + - [531, 262.244] - - [704, 6784, 1, 1280] - - [509, 7740.66] + - [615, 7740.66] - - [4, 1024, 1, 1280] - - [462, 378.921] + - [568, 378.921] - - [5888, 128, 1, 256] - - [510, 5003.68] + - [616, 5003.68] - - [4096, 16, 1, 4096] - - [462, 1585.85] + - [568, 1585.85] - - [1856, 5056, 1, 3328] - - [503, 8522.92] + - [609, 8522.92] - - [4, 6784, 1, 256] - - [447, 387.757] + - [553, 387.757] - - [1024, 3584, 1, 128] - - [497, 3031.61] + - [603, 3031.61] - - [1024, 1408, 1, 128] - - [499, 2600.85] + - [605, 2600.85] - - [2368, 2944, 1, 128] - - [496, 4340.26] + - [602, 4340.26] - - [5056, 64, 1, 256] - - [510, 3109.62] + - [616, 3109.62] - - [4, 448, 1, 1280] - - [530, 253.835] + - [636, 253.835] - - [5056, 2944, 1, 128] - - [501, 3740.01] + - [607, 3740.01] - - [5888, 5056, 1, 3328] - - [515, 9016.48] + - [621, 9016.48] - - [1024, 704, 1, 128] - - [497, 2363.66] + - [603, 2363.66] - - [5888, 2368, 1, 128] - - [500, 3651.83] + - [606, 3651.83] - - [128, 5056, 1, 3328] - - [504, 6243.64] + - [610, 6243.64] - - [3584, 6784, 1, 1280] - - [502, 9080.67] + - [608, 9080.67] - - [448, 4, 1, 1280] - - [530, 243.083] + - [636, 243.083] - - [1856, 5888, 1, 256] - - [515, 8182.12] + - [621, 8182.12] - - [256, 256, 1, 256] - - [448, 1542.12] + - [554, 1542.12] - - [256, 64, 1, 128] - - [429, 135.226] + - [535, 135.226] - - [4288, 4288, 1, 3328] - - [515, 8674.64] + - [621, 8674.64] - - [4288, 1408, 1, 1280] - - [503, 7867.18] + - [609, 7867.18] - - [3584, 5056, 1, 128] - - [493, 4457.83] + - [599, 4457.83] - - [4, 1024, 1, 3328] - - [442, 440.394] + - [548, 440.394] - - [4288, 2368, 1, 256] - - [523, 5699.57] + - [629, 5699.57] - - [2944, 5056, 1, 1280] - - [515, 8236.56] + - [621, 8236.56] - - [448, 6784, 1, 256] - - [505, 6620.62] + - [611, 6620.62] - - [64, 128, 1, 128] - - [430, 67.6629] + - [536, 67.6629] - - [1856, 2368, 1, 128] - - [497, 4233.7] + - [603, 4233.7] - - [6784, 2368, 1, 3328] - - [515, 8269.9] + - [621, 8269.9] - - [256, 1024, 1, 1280] - - [504, 4882.88] + - [610, 4882.88] - - [704, 4, 1, 128] - - [524, 19.111] + - [630, 19.111] - - [256, 4, 1, 256] - - [462, 46.9114] + - [568, 46.9114] - - [4288, 128, 1, 256] - - [510, 4273.49] + - [616, 4273.49] - - [4288, 1856, 1, 3328] - - [505, 8195.81] + - [611, 8195.81] - - [3584, 448, 1, 128] - - [498, 2750.65] + - [604, 2750.65] - - [2048, 1600, 1, 2048] - - [521, 5753.59] + - [627, 5753.59] - - [256, 4, 1, 3328] - - [531, 297.978] + - [637, 297.978] - - [4, 1408, 1, 1280] - - [529, 402.386] + - [635, 402.386] - - [3584, 64, 1, 1280] - - [518, 4096.1] + - [624, 4096.1] - - [1408, 448, 1, 128] - - [493, 2498.25] + - [599, 2498.25] - - [3584, 1024, 1, 1280] - - [515, 7252.18] + - [621, 7252.18] - - [1856, 5056, 1, 256] - - [509, 7711.59] + - [615, 7711.59] - - [4, 3584, 1, 256] - - [526, 314.314] + - [632, 314.314] - - [4, 2944, 1, 1280] - - [462, 483.218] + - [568, 483.218] - - [1024, 4288, 1, 256] - - [514, 6544.52] + - [620, 6544.52] - - [5888, 3584, 1, 3328] - - [503, 8105.15] + - [609, 8105.15] - - [1856, 4, 1, 256] - - [462, 252.832] + - [568, 252.832] - - [4, 256, 1, 256] - - [447, 48.2882] + - [553, 48.2882] - - [5056, 3584, 1, 3328] - - [508, 7354.8] + - [614, 7354.8] - - [704, 448, 1, 128] - - [501, 1233.91] + - [607, 1233.91] - - [2368, 1408, 1, 1280] - - [509, 6654.24] + - [615, 6654.24] - - [5056, 2944, 1, 1280] - - [515, 8505.72] + - [621, 8505.72] - - [4, 4, 1, 128] - - [525, 0.1478505] + - [631, 0.1478505] - - [3584, 256, 1, 256] - - [507, 4616.47] + - [613, 4616.47] - - [1024, 6784, 1, 256] - - [509, 7944.98] + - [615, 7944.98] - - [4, 128, 1, 256] - - [462, 29.3571] + - [568, 29.3571] - - [64, 64, 1, 1280] - - [473, 642.61] + - [579, 642.61] - - [5124, 9124, 1, 2048] - - [515, 8019.4] + - [621, 8019.4] - - [6784, 4, 1, 128] - - [524, 193.067] + - [630, 193.067] - - [2944, 1408, 1, 128] - - [493, 3827.13] + - [599, 3827.13] - - [448, 128, 1, 3328] - - [466, 4064.0] + - [572, 4064.0] - - [3584, 1408, 1, 1280] - - [515, 7180.83] + - [621, 7180.83] - - [64, 4288, 1, 3328] - - [461, 4786.84] + - [567, 4786.84] - - [5056, 6784, 1, 3328] - - [502, 7889.83] + - [608, 7889.83] - - [128, 2944, 1, 256] - - [505, 3599.69] + - [611, 3599.69] - - [128, 6784, 1, 128] - - [423, 2606.79] + - [529, 2606.79] - - [3584, 4288, 1, 256] - - [509, 7299.81] + - [615, 7299.81] - - [448, 1856, 1, 256] - - [505, 5207.07] + - [611, 5207.07] - - [1856, 6784, 1, 3328] - - [507, 8386.36] + - [613, 8386.36] - - [3584, 128, 1, 3328] - - [451, 5590.04] + - [557, 5590.04] - - [64, 1856, 1, 256] - - [444, 1949.38] + - [550, 1949.38] - - [64, 448, 1, 256] - - [449, 955.833] + - [555, 955.833] - - [5888, 4288, 1, 256] - - [513, 7791.84] + - [619, 7791.84] - - [4, 448, 1, 128] - - [524, 8.84146] + - [630, 8.84146] - - [5056, 1408, 1, 256] - - [515, 5154.01] + - [621, 5154.01] - - [35, 8457, 1, 2048] - - [420, 3182.57] + - [526, 3182.57] - - [64, 256, 1, 1280] - - [469, 1713.46] + - [575, 1713.46] - - [3584, 1024, 1, 256] - - [505, 6528.18] + - [611, 6528.18] - - [256, 704, 1, 256] - - [504, 2720.46] + - [610, 2720.46] - - [5888, 5888, 1, 256] - - [513, 7992.26] + - [619, 7992.26] - - [4288, 1024, 1, 1280] - - [507, 7837.5] + - [613, 7837.5] - - [5888, 128, 1, 3328] - - [510, 7181.13] + - [616, 7181.13] - - [448, 6784, 1, 3328] - - [504, 7663.1] + - [610, 7663.1] - - [2944, 1408, 1, 1280] - - [513, 7903.14] + - [619, 7903.14] - - [64, 128, 1, 1280] - - [462, 1191.66] + - [568, 1191.66] - - [2944, 1856, 1, 3328] - - [503, 7844.41] + - [609, 7844.41] - - [2368, 64, 1, 128] - - [434, 997.973] + - [540, 997.973] - - [256, 1024, 1, 128] - - [493, 1215.84] + - [599, 1215.84] - - [3584, 5888, 1, 1280] - - [502, 8958.94] + - [608, 8958.94] - - [64, 4, 1, 128] - - [525, 1.21608] + - [631, 1.21608] - - [6784, 1856, 1, 1280] - - [502, 6728.8] + - [608, 6728.8] - - [2944, 5056, 1, 256] - - [515, 8275.21] + - [621, 8275.21] - - [4288, 4, 1, 128] - - [524, 147.644] + - [630, 147.644] - - [5888, 256, 1, 3328] - - [511, 7094.2] + - [617, 7094.2] - - [2944, 4288, 1, 128] - - [496, 4611.55] + - [602, 4611.55] - - [3584, 1408, 1, 256] - - [506, 6543.06] + - [612, 6543.06] - - [704, 3584, 1, 3328] - - [505, 8117.2] + - [611, 8117.2] - - [4096, 3200, 1, 1024] - - [520, 6656.13] + - [626, 6656.13] - - [5056, 448, 1, 1280] - - [518, 6096.2] + - [624, 6096.2] - - [3584, 1856, 1, 3328] - - [503, 8552.41] + - [609, 8552.41] - - [4288, 6784, 1, 1280] - - [509, 8212.46] + - [615, 8212.46] - - [2560, 7000, 1, 2560] - - [511, 7655.34] + - [617, 7655.34] - - [1408, 704, 1, 1280] - - [507, 5756.79] + - [613, 5756.79] - - [2944, 1024, 1, 256] - - [515, 6880.91] + - [621, 6880.91] - - [6784, 64, 1, 256] - - [510, 4438.96] + - [616, 4438.96] - - [2368, 4288, 1, 3328] - - [511, 8377.99] + - [617, 8377.99] - - [4, 1408, 1, 256] - - [528, 222.599] + - [634, 222.599] - - [1024, 1408, 1, 1280] - - [505, 6339.38] + - [611, 6339.38] - - [64, 64, 1, 256] - - [462, 187.346] + - [568, 187.346] - - [704, 256, 1, 3328] - - [504, 4046.14] + - [610, 4046.14] - - [6784, 5056, 1, 256] - - [515, 7972.17] + - [621, 7972.17] - - [1856, 1856, 1, 128] - - [499, 3716.61] + - [605, 3716.61] - - [3584, 5056, 1, 3328] - - [515, 8684.76] + - [621, 8684.76] - - [448, 6784, 1, 128] - - [497, 3829.05] + - [603, 3829.05] - - [4, 704, 1, 3328] - - [530, 393.206] + - [636, 393.206] - - [35, 8457, 1, 4096] - - [419, 3173.24] + - [525, 3173.24] - - [448, 2944, 1, 256] - - [513, 5553.41] + - [619, 5553.41] - - [4, 4288, 1, 3328] - - [472, 573.211] + - [578, 573.211] - - [2944, 6784, 1, 256] - - [509, 8566.06] + - [615, 8566.06] - - [2944, 2944, 1, 128] - - [493, 4540.83] + - [599, 4540.83] - - [4, 4, 1, 1280] - - [472, 3.14762] + - [578, 3.14762] - - [1856, 3584, 1, 1280] - - [509, 7306.36] + - [615, 7306.36] - - [64, 2944, 1, 256] - - [521, 2292.61] + - [627, 2292.61] - - [448, 256, 1, 128] - - [430, 797.93] + - [536, 797.93] - - [4288, 448, 1, 128] - - [496, 3430.5] + - [602, 3430.5] - - [4608, 24000, 1, 1536] - - [514, 6820.24] + - [620, 6820.24] - - [1856, 1408, 1, 3328] - - [517, 6600.24] + - [623, 6600.24] - - [128, 128, 1, 128] - - [422, 161.917] + - [528, 161.917] - - [1024, 4288, 1, 3328] - - [505, 7937.08] + - [611, 7937.08] - - [448, 2368, 1, 256] - - [513, 4526.45] + - [619, 4526.45] - - [1024, 4, 1, 128] - - [525, 16.9907] + - [631, 16.9907] - - [64, 1408, 1, 1280] - - [445, 3345.32] + - [551, 3345.32] - - [64, 6784, 1, 1280] - - [510, 5526.6] + - [616, 5526.6] - - [5056, 448, 1, 256] - - [504, 4216.65] + - [610, 4216.65] - - [2944, 2368, 1, 3328] - - [515, 7000.42] + - [621, 7000.42] - - [704, 4288, 1, 3328] - - [521, 6414.43] + - [627, 6414.43] - - [1408, 128, 1, 256] - - [504, 2720.46] + - [610, 2720.46] - - [1024, 1856, 1, 1280] - - [515, 7682.93] + - [621, 7682.93] - - [2048, 6400, 1, 2048] - - [511, 7418.22] + - [617, 7418.22] - - [512, 48000, 1, 2816] - - [515, 8884.77] + - [621, 8884.77] - - [5124, 9124, 1, 2560] - - [507, 6040.8] + - [613, 6040.8] - - [128, 2368, 1, 3328] - - [461, 5025.66] + - [567, 5025.66] - - [1024, 5888, 1, 256] - - [509, 7322.21] + - [615, 7322.21] - - [64, 2944, 1, 1280] - - [445, 4222.31] + - [551, 4222.31] - - [5056, 64, 1, 3328] - - [486, 4936.32] + - [592, 4936.32] - - [128, 704, 1, 128] - - [431, 683.414] + - [537, 683.414] - - [1408, 2368, 1, 256] - - [510, 6404.22] + - [616, 6404.22] - - [1408, 1408, 1, 256] - - [515, 4537.93] + - [621, 4537.93] - - [4, 64, 1, 128] - - [524, 2.56747] + - [630, 2.56747] - - [64, 1024, 1, 128] - - [423, 532.372] + - [529, 532.372] - - [1024, 8, 1, 500000] - - [413, 1685.08] + - [519, 1685.08] - - [2368, 2368, 1, 128] - - [494, 4334.33] + - [600, 4334.33] - - [64, 5888, 1, 128] - - [423, 2003.19] + - [529, 2003.19] - - [5888, 4, 1, 3328] - - [441, 339.118] + - [547, 339.118] - - [6784, 1408, 1, 128] - - [497, 4431.23] + - [603, 4431.23] - - [4288, 5888, 1, 256] - - [515, 7800.88] + - [621, 7800.88] - - [1408, 5056, 1, 256] - - [509, 8153.38] + - [615, 8153.38] - - [5056, 128, 1, 3328] - - [466, 5829.93] + - [572, 5829.93] - - [128, 128, 1, 1280] - - [469, 1691.35] + - [575, 1691.35] - - [448, 704, 1, 256] - - [510, 3364.28] + - [616, 3364.28] - - [4288, 3584, 1, 128] - - [494, 2952.68] + - [600, 2952.68] - - [2944, 128, 1, 3328] - - [466, 5620.82] + - [572, 5620.82] - - [64, 1408, 1, 3328] - - [467, 4169.91] + - [573, 4169.91] - - [3584, 5056, 1, 1280] - - [512, 7780.76] + - [618, 7780.76] - - [256, 448, 1, 1280] - - [445, 3929.45] + - [551, 3929.45] - - [704, 704, 1, 128] - - [493, 2346.17] + - [599, 2346.17] - - [5056, 4, 1, 128] - - [524, 144.557] + - [630, 144.557] - - [704, 256, 1, 1280] - - [513, 2283.22] + - [619, 2283.22] - - [64, 2368, 1, 3328] - - [445, 4921.69] + - [551, 4921.69] - - [1856, 1024, 1, 128] - - [494, 3459.57] + - [600, 3459.57] - - [1856, 64, 1, 128] - - [426, 918.237] + - [532, 918.237] - - [4096, 64, 1, 4096] - - [471, 4000.62] + - [577, 4000.62] - - [1024, 24000, 1, 1536] - - [507, 8502.36] + - [613, 8502.36] - - [704, 4288, 1, 256] - - [511, 6003.83] + - [617, 6003.83] - - [5888, 2368, 1, 1280] - - [502, 8801.3] + - [608, 8801.3] - - [128, 256, 1, 256] - - [456, 1070.08] + - [562, 1070.08] - - [64, 128, 1, 256] - - [462, 374.591] + - [568, 374.591] - - [2368, 5888, 1, 1280] - - [505, 8308.63] + - [611, 8308.63] - - [5888, 256, 1, 1280] - - [513, 7154.42] + - [619, 7154.42] - - [1760, 128, 1, 1760] - - [454, 5363.91] + - [560, 5363.91] - - [4, 5888, 1, 1280] - - [462, 542.304] + - [568, 542.304] - - [704, 128, 1, 128] - - [434, 779.447] + - [540, 779.447] - - [1024, 4, 1, 1280] - - [462, 392.531] + - [568, 392.531] - - [2368, 1856, 1, 3328] - - [505, 7975.32] + - [611, 7975.32] - - [2368, 128, 1, 128] - - [427, 1584.96] + - [533, 1584.96] - - [2944, 704, 1, 256] - - [513, 4039.21] + - [619, 4039.21] - - [5056, 128, 1, 128] - - [493, 2575.89] + - [599, 2575.89] - - [2368, 1024, 1, 3328] - - [521, 6165.54] + - [627, 6165.54] - - [256, 704, 1, 3328] - - [504, 4028.74] + - [610, 4028.74] - - [704, 3584, 1, 256] - - [515, 6102.92] + - [621, 6102.92] - - [704, 2944, 1, 3328] - - [505, 8202.84] + - [611, 8202.84] - - [6784, 1024, 1, 128] - - [497, 4386.4] + - [603, 4386.4] - - [256, 448, 1, 128] - - [434, 834.195] + - [540, 834.195] - - [448, 1024, 1, 3328] - - [522, 5412.48] + - [628, 5412.48] - - [2944, 1024, 1, 3328] - - [515, 6265.87] + - [621, 6265.87] - - [2944, 5056, 1, 128] - - [493, 4770.88] + - [599, 4770.88] - - [2368, 256, 1, 256] - - [510, 3975.23] + - [616, 3975.23] - - [1408, 6784, 1, 256] - - [509, 7987.02] + - [615, 7987.02] - - [6784, 1408, 1, 3328] - - [509, 8472.71] + - [615, 8472.71] - - [4288, 6784, 1, 128] - - [500, 3865.2] + - [606, 3865.2] - - [704, 64, 1, 256] - - [448, 1287.41] + - [554, 1287.41] - - [5888, 4, 1, 1280] - - [447, 510.022] + - [553, 510.022] - - [256, 2368, 1, 3328] - - [510, 5837.65] + - [616, 5837.65] - - [6784, 2944, 1, 1280] - - [515, 8560.54] + - [621, 8560.54] - - [4288, 1856, 1, 128] - - [493, 4617.07] + - [599, 4617.07] - - [1856, 2944, 1, 128] - - [493, 4287.73] + - [599, 4287.73] - - [6784, 448, 1, 128] - - [497, 3893.43] + - [603, 3893.43] - - [64, 3584, 1, 128] - - [423, 1609.76] + - [529, 1609.76] - - [448, 5056, 1, 1280] - - [513, 7124.41] + - [619, 7124.41] - - [2368, 1856, 1, 128] - - [496, 4004.65] + - [602, 4004.65] - - [64, 2944, 1, 3328] - - [446, 5086.48] + - [552, 5086.48] - - [4288, 704, 1, 256] - - [511, 6176.57] + - [617, 6176.57] - - [256, 3584, 1, 128] - - [494, 2553.15] + - [600, 2553.15] - - [5888, 704, 1, 256] - - [510, 6781.51] + - [616, 6781.51] - - [3584, 1024, 1, 128] - - [497, 3660.95] + - [603, 3660.95] - - [256, 5888, 1, 3328] - - [513, 7772.13] + - [619, 7772.13] - - [1408, 4288, 1, 3328] - - [509, 8832.86] + - [615, 8832.86] - - [6784, 4288, 1, 256] - - [515, 8566.14] + - [621, 8566.14] - - [4288, 256, 1, 128] - - [495, 1953.79] + - [601, 1953.79] - - [5888, 256, 1, 256] - - [513, 3730.53] + - [619, 3730.53] - - [6784, 1024, 1, 1280] - - [509, 8578.39] + - [615, 8578.39] - - [5888, 1024, 1, 128] - - [494, 4092.96] + - [600, 4092.96] - - [1024, 128, 1, 256] - - [444, 1897.98] + - [550, 1897.98] - - [512, 16, 1, 500000] - - [415, 2363.79] + - [521, 2363.79] - - [128, 64, 1, 3328] - - [472, 1592.56] + - [578, 1592.56] - - [448, 64, 1, 256] - - [462, 976.168] + - [568, 976.168] - - [2368, 256, 1, 128] - - [497, 2094.99] + - [603, 2094.99] - - [6784, 3584, 1, 1280] - - [509, 8570.16] + - [615, 8570.16] - - [1024, 6784, 1, 1280] - - [515, 8203.57] + - [621, 8203.57] - - [2944, 64, 1, 1280] - - [453, 4300.61] + - [559, 4300.61] - - [1408, 2944, 1, 1280] - - [505, 7349.64] + - [611, 7349.64] - - [256, 1856, 1, 256] - - [504, 4649.75] + - [610, 4649.75] - - [2048, 800, 1, 2048] - - [523, 4668.73] + - [629, 4668.73] - - [1408, 2368, 1, 3328] - - [513, 7537.74] + - [619, 7537.74] - - [2944, 4, 1, 3328] - - [462, 514.142] + - [568, 514.142] - - [128, 1408, 1, 3328] - - [454, 4991.64] + - [560, 4991.64] - - [2944, 1856, 1, 128] - - [493, 4317.39] + - [599, 4317.39] - - [256, 2944, 1, 128] - - [493, 2258.27] + - [599, 2258.27] - - [256, 6784, 1, 128] - - [493, 3147.02] + - [599, 3147.02] - - [2368, 4, 1, 128] - - [525, 33.9286] + - [631, 33.9286] - - [1408, 256, 1, 3328] - - [504, 5077.85] + - [610, 5077.85] - - [1856, 4, 1, 128] - - [525, 21.5025] + - [631, 21.5025] - - [5056, 6784, 1, 128] - - [493, 4945.11] + - [599, 4945.11] - - [4288, 5056, 1, 128] - - [496, 4729.87] + - [602, 4729.87] - - [1856, 5888, 1, 128] - - [493, 4707.96] + - [599, 4707.96] - - [2944, 5888, 1, 256] - - [507, 8014.78] + - [613, 8014.78] - - [3584, 1856, 1, 256] - - [509, 7567.13] + - [615, 7567.13] - - [4288, 3584, 1, 1280] - - [502, 8726.43] + - [608, 8726.43] - - [2368, 448, 1, 256] - - [510, 4227.7] + - [616, 4227.7] - - [4288, 256, 1, 3328] - - [511, 5487.41] + - [617, 5487.41] - - [1856, 704, 1, 128] - - [497, 3125.06] + - [603, 3125.06] - - [1408, 64, 1, 256] - - [457, 1620.09] + - [563, 1620.09] - - [64, 1856, 1, 128] - - [421, 955.147] + - [527, 955.147] - - [4, 256, 1, 128] - - [524, 10.8789] + - [630, 10.8789] - - [2560, 16, 1, 2560] - - [469, 2019.7] + - [575, 2019.7] - - [704, 5888, 1, 128] - - [498, 3976.26] + - [604, 3976.26] - - [6784, 3584, 1, 128] - - [497, 4018.91] + - [603, 4018.91] - - [1024, 64, 1, 256] - - [462, 1370.79] + - [568, 1370.79] - - [64, 2368, 1, 256] - - [504, 2255.76] + - [610, 2255.76] - - [4288, 5056, 1, 3328] - - [509, 8368.69] + - [615, 8368.69] - - [4, 1856, 1, 1280] - - [462, 392.126] + - [568, 392.126] - - [4288, 128, 1, 128] - - [427, 2287.03] + - [533, 2287.03] - - [1408, 1408, 1, 128] - - [497, 3233.48] + - [603, 3233.48] - - [7680, 16, 1, 2560] - - [465, 2257.37] + - [571, 2257.37] - - [1856, 128, 1, 128] - - [427, 1532.8] + - [533, 1532.8] - - [5056, 2368, 1, 256] - - [509, 8167.29] + - [615, 8167.29] - - [4288, 704, 1, 3328] - - [515, 6411.16] + - [621, 6411.16] - - [448, 3584, 1, 256] - - [515, 5477.74] + - [621, 5477.74] - - [2368, 64, 1, 1280] - - [445, 3936.52] + - [551, 3936.52] - - [2368, 1024, 1, 1280] - - [511, 7688.82] + - [617, 7688.82] - - [2944, 1408, 1, 3328] - - [502, 7668.78] + - [608, 7668.78] - - [1408, 448, 1, 256] - - [504, 4863.98] + - [610, 4863.98] - - [1024, 1408, 1, 3328] - - [513, 7448.99] + - [619, 7448.99] - - [2944, 5888, 1, 1280] - - [503, 8208.57] + - [609, 8208.57] - - [1408, 4, 1, 1280] - - [442, 479.419] + - [548, 479.419] - - [5888, 3584, 1, 256] - - [503, 8610.09] + - [609, 8610.09] - - [2368, 5056, 1, 128] - - [500, 3726.25] + - [606, 3726.25] - - [1408, 1856, 1, 3328] - - [504, 7829.48] + - [610, 7829.48] - - [4, 4, 1, 3328] - - [531, 4.39419] + - [637, 4.39419] - - [6784, 1408, 1, 1280] - - [504, 7690.8] + - [610, 7690.8] - - [4096, 7000, 1, 4096] - - [516, 6272.49] + - [622, 6272.49] - - [704, 2944, 1, 256] - - [505, 6095.91] + - [611, 6095.91] - - [4288, 64, 1, 256] - - [470, 2121.31] + - [576, 2121.31] - - [6784, 5888, 1, 3328] - - [509, 8955.6] + - [615, 8955.6] - - [2368, 4288, 1, 128] - - [493, 4699.65] + - [599, 4699.65] - - [64, 4288, 1, 1280] - - [483, 4013.73] + - [589, 4013.73] - - [6784, 64, 1, 1280] - - [504, 5418.83] + - [610, 5418.83] - - [3584, 128, 1, 128] - - [433, 2165.3] + - [539, 2165.3] - - [1024, 6784, 1, 128] - - [494, 3765.3] + - [600, 3765.3] - - [4, 1856, 1, 128] - - [525, 33.3728] + - [631, 33.3728] - - [1408, 64, 1, 3328] - - [466, 4489.51] + - [572, 4489.51] - - [6784, 4, 1, 256] - - [462, 400.262] + - [568, 400.262] - - [1408, 1408, 1, 1280] - - [509, 8139.53] + - [615, 8139.53] - - [16384, 400, 1, 4096] - - [513, 6087.28] + - [619, 6087.28] - - [256, 2368, 1, 256] - - [504, 4766.35] + - [610, 4766.35] - - [448, 4288, 1, 3328] - - [511, 7577.08] + - [617, 7577.08] - - [2368, 1408, 1, 256] - - [507, 5284.53] + - [613, 5284.53] - - [5888, 5056, 1, 128] - - [494, 3643.6] + - [600, 3643.6] - - [704, 2368, 1, 256] - - [509, 5334.73] + - [615, 5334.73] - - [1024, 24000, 1, 2560] - - [517, 7438.06] + - [623, 7438.06] - - [2944, 448, 1, 1280] - - [518, 4937.53] + - [624, 4937.53] - - [5888, 2368, 1, 3328] - - [503, 8201.84] + - [609, 8201.84] - - [5124, 9124, 1, 1760] - - [510, 6764.06] + - [616, 6764.06] - - [448, 1408, 1, 1280] - - [504, 5881.54] + - [610, 5881.54] - - [448, 1856, 1, 1280] - - [511, 6225.56] + - [617, 6225.56] - - [4288, 448, 1, 1280] - - [513, 5626.37] + - [619, 5626.37] - - [5888, 704, 1, 3328] - - [507, 7873.62] + - [613, 7873.62] - - [5056, 256, 1, 128] - - [498, 2921.03] + - [604, 2921.03] - - [1856, 256, 1, 128] - - [500, 1995.42] + - [606, 1995.42] - - [64, 1408, 1, 128] - - [421, 758.938] + - [527, 758.938] - - [704, 4, 1, 256] - - [462, 130.697] + - [568, 130.697] - - [1408, 5888, 1, 128] - - [493, 4574.05] + - [599, 4574.05] - - [7680, 12000, 1, 2560] - - [509, 8747.13] + - [615, 8747.13] - - [1408, 1024, 1, 256] - - [506, 4609.23] + - [612, 4609.23] - - [8192, 400, 1, 2048] - - [518, 5283.25] + - [624, 5283.25] - - [1024, 1856, 1, 128] - - [493, 2686.38] + - [599, 2686.38] - - [256, 704, 1, 128] - - [493, 1004.83] + - [599, 1004.83] - - [2560, 128, 1, 2560] - - [471, 4259.14] + - [577, 4259.14] - - [448, 1024, 1, 256] - - [504, 4813.24] + - [610, 4813.24] - - [128, 4, 1, 3328] - - [530, 128.408] + - [636, 128.408] - - [5056, 6784, 1, 1280] - - [512, 6579.85] + - [618, 6579.85] - - [1408, 64, 1, 128] - - [434, 819.3] + - [540, 819.3] - - [1024, 448, 1, 1280] - - [513, 5703.31] + - [619, 5703.31] - - [704, 5056, 1, 3328] - - [505, 7574.49] + - [611, 7574.49] - - [128, 5056, 1, 256] - - [504, 5113.53] + - [610, 5113.53] - - [64, 1024, 1, 3328] - - [489, 3980.1] + - [595, 3980.1] - - [1856, 4, 1, 3328] - - [443, 433.253] + - [549, 433.253] - - [4, 2944, 1, 128] - - [525, 46.6225] + - [631, 46.6225] - - [2368, 2944, 1, 3328] - - [503, 9002.13] + - [609, 9002.13] - - [448, 448, 1, 1280] - - [445, 3969.52] + - [551, 3969.52] - - [2368, 3584, 1, 256] - - [515, 7806.39] + - [621, 7806.39] - - [5056, 3584, 1, 1280] - - [502, 8971.56] + - [608, 8971.56] - - [5124, 9124, 1, 4096] - - [515, 7208.72] + - [621, 7208.72] - - [7680, 48000, 1, 2560] - - [509, 3835.91] + - [615, 3835.91] - - [448, 4, 1, 3328] - - [530, 409.7] + - [636, 409.7] - - [1856, 2944, 1, 1280] - - [502, 7173.71] + - [608, 7173.71] - - [1024, 48000, 1, 2816] - - [509, 8976.26] + - [615, 8976.26] - - [128, 1024, 1, 256] - - [448, 1969.26] + - [554, 1969.26] - - [2944, 1408, 1, 256] - - [511, 4585.12] + - [617, 4585.12] - - [4288, 1408, 1, 3328] - - [505, 8237.27] + - [611, 8237.27] - - [3584, 64, 1, 3328] - - [451, 5183.16] + - [557, 5183.16] - - [5888, 2944, 1, 128] - - [500, 3674.56] + - [606, 3674.56] - - [2944, 1024, 1, 128] - - [497, 3834.32] + - [603, 3834.32] - - [4288, 5056, 1, 1280] - - [509, 8086.1] + - [615, 8086.1] - - [5888, 6784, 1, 1280] - - [503, 6941.32] + - [609, 6941.32] - - [6784, 5056, 1, 128] - - [494, 4860.15] + - [600, 4860.15] - - [256, 1024, 1, 3328] - - [518, 5156.22] + - [624, 5156.22] - - [3584, 4, 1, 256] - - [462, 332.529] + - [568, 332.529] - - [1760, 1600, 1, 1760] - - [505, 6330.76] + - [611, 6330.76] - - [1856, 64, 1, 3328] - - [466, 4756.03] + - [572, 4756.03] - - [4, 128, 1, 3328] - - [530, 160.244] + - [636, 160.244] - - [5888, 1408, 1, 3328] - - [503, 8722.74] + - [609, 8722.74] - - [448, 2944, 1, 128] - - [496, 2997.63] + - [602, 2997.63] - - [2368, 1856, 1, 256] - - [504, 6662.34] + - [610, 6662.34] - - [256, 5056, 1, 256] - - [506, 5256.29] + - [612, 5256.29] - - [128, 3584, 1, 128] - - [425, 2073.56] + - [531, 2073.56] - - [448, 3584, 1, 3328] - - [502, 6833.96] + - [608, 6833.96] - - [4, 5056, 1, 3328] - - [472, 581.523] + - [578, 581.523] - - [704, 2368, 1, 128] - - [493, 3402.29] + - [599, 3402.29] - - [5888, 256, 1, 128] - - [498, 2977.54] + - [604, 2977.54] - - [4, 5056, 1, 128] - - [524, 65.2074] + - [630, 65.2074] - - [448, 256, 1, 256] - - [510, 1764.53] + - [616, 1764.53] - - [704, 4, 1, 3328] - - [462, 398.554] + - [568, 398.554] - - [1408, 256, 1, 256] - - [505, 3463.86] + - [611, 3463.86] - - [3584, 1856, 1, 128] - - [501, 3228.19] + - [607, 3228.19] - - [4288, 4288, 1, 128] - - [497, 4853.93] + - [603, 4853.93] - - [1856, 1024, 1, 3328] - - [521, 5994.68] + - [627, 5994.68] - - [128, 5888, 1, 3328] - - [475, 6512.85] + - [581, 6512.85] - - [1024, 5056, 1, 256] - - [515, 7859.42] + - [621, 7859.42] - - [5888, 5888, 1, 1280] - - [515, 8131.44] + - [621, 8131.44] - - [5056, 5888, 1, 128] - - [494, 4920.71] + - [600, 4920.71] - - [2368, 1408, 1, 3328] - - [513, 7110.74] + - [619, 7110.74] - - [1024, 48000, 1, 1536] - - [513, 8590.82] + - [619, 8590.82] - - [5888, 448, 1, 256] - - [514, 3567.74] + - [620, 3567.74] - - [2560, 3200, 1, 2560] - - [504, 7638.31] + - [610, 7638.31] - - [5888, 6784, 1, 128] - - [494, 3910.92] + - [600, 3910.92] - - [6144, 48000, 1, 2048] - - [515, 3412.95] + - [621, 3412.95] - - [6784, 5056, 1, 1280] - - [506, 7890.22] + - [612, 7890.22] - - [5056, 704, 1, 1280] - - [510, 7665.06] + - [616, 7665.06] - - [1024, 48000, 1, 2560] - - [515, 8188.5] + - [621, 8188.5] - - [4608, 32, 1, 1536] - - [483, 2856.97] + - [589, 2856.97] - - [1024, 2368, 1, 128] - - [493, 3019.35] + - [599, 3019.35] - - [128, 704, 1, 256] - - [444, 1696.33] + - [550, 1696.33] - - [2368, 448, 1, 3328] - - [510, 5799.29] + - [616, 5799.29] - - [128, 5888, 1, 1280] - - [504, 6680.75] + - [610, 6680.75] - - [16384, 800, 1, 4096] - - [509, 6322.22] + - [615, 6322.22] - - [448, 128, 1, 1280] - - [483, 2849.49] + - [589, 2849.49] - - [6784, 4, 1, 3328] - - [462, 563.12] + - [568, 563.12] - - [5888, 5056, 1, 1280] - - [509, 8631.33] + - [615, 8631.33] - - [1024, 64, 1, 3328] - - [484, 3481.96] + - [590, 3481.96] - - [3072, 48000, 1, 1024] - - [509, 9019.49] + - [615, 9019.49] - - [64, 3584, 1, 1280] - - [446, 4327.95] + - [552, 4327.95] - - [6784, 1408, 1, 256] - - [509, 6320.59] + - [615, 6320.59] - - [3584, 5888, 1, 128] - - [496, 4406.79] + - [602, 4406.79] - - [5056, 5888, 1, 256] - - [515, 8037.13] + - [621, 8037.13] - - [2368, 1024, 1, 256] - - [507, 4936.14] + - [613, 4936.14] - - [2944, 1856, 1, 256] - - [515, 7222.32] + - [621, 7222.32] - - [1856, 6784, 1, 1280] - - [505, 8251.81] + - [611, 8251.81] - - [64, 5056, 1, 128] - - [425, 1643.7] + - [531, 1643.7] - - [64, 6784, 1, 128] - - [423, 1929.77] + - [529, 1929.77] - - [448, 704, 1, 128] - - [495, 979.959] + - [601, 979.959] - - [4, 1024, 1, 128] - - [524, 20.1416] + - [630, 20.1416] - - [4288, 3584, 1, 256] - - [509, 8444.14] + - [615, 8444.14] - - [1408, 704, 1, 128] - - [493, 3021.0] + - [599, 3021.0] - - [64, 256, 1, 3328] - - [489, 2227.47] + - [595, 2227.47] - - [6784, 448, 1, 3328] - - [515, 6573.11] + - [621, 6573.11] - - [5056, 1856, 1, 1280] - - [507, 7976.23] + - [613, 7976.23] - - [1408, 1024, 1, 3328] - - [505, 7470.33] + - [611, 7470.33] - - [2368, 256, 1, 3328] - - [510, 5394.37] + - [616, 5394.37] - - [5888, 3584, 1, 1280] - - [502, 9031.55] + - [608, 9031.55] - - [1856, 3584, 1, 3328] - - [517, 7272.6] + - [623, 7272.6] - - [5888, 128, 1, 1280] - - [510, 6684.48] + - [616, 6684.48] - - [1024, 2944, 1, 256] - - [515, 7415.09] + - [621, 7415.09] - - [448, 6784, 1, 1280] - - [511, 7923.78] + - [617, 7923.78] - - [256, 3584, 1, 1280] - - [507, 6901.87] + - [613, 6901.87] - - [704, 5056, 1, 256] - - [512, 5004.55] + - [618, 5004.55] - - [3584, 1024, 1, 3328] - - [504, 7894.63] + - [610, 7894.63] - - [2944, 1856, 1, 1280] - - [509, 7903.27] + - [615, 7903.27] - - [128, 256, 1, 128] - - [422, 325.745] + - [528, 325.745] - - [5056, 256, 1, 256] - - [506, 3356.56] + - [612, 3356.56] - - [2944, 4288, 1, 3328] - - [515, 7813.93] + - [621, 7813.93] - - [2368, 3584, 1, 3328] - - [515, 8371.09] + - [621, 8371.09] - - [2944, 704, 1, 1280] - - [521, 5514.09] + - [627, 5514.09] - - [128, 4, 1, 256] - - [462, 25.3062] + - [568, 25.3062] - - [2944, 3584, 1, 1280] - - [509, 7738.83] + - [615, 7738.83] - - [1856, 5888, 1, 1280] - - [503, 8584.63] + - [609, 8584.63] - - [256, 256, 1, 1280] - - [483, 2962.18] + - [589, 2962.18] - - [2048, 3200, 1, 2048] - - [511, 6911.69] + - [617, 6911.69] - - [4288, 1408, 1, 256] - - [509, 7954.0] + - [615, 7954.0] - - [3584, 64, 1, 256] - - [510, 2780.42] + - [616, 2780.42] - - [64, 1856, 1, 3328] - - [445, 4912.04] + - [551, 4912.04] - - [256, 1408, 1, 128] - - [493, 1373.24] + - [599, 1373.24] - - [5888, 1408, 1, 128] - - [498, 4242.01] + - [604, 4242.01] - - [4288, 2368, 1, 1280] - - [507, 8012.7] + - [613, 8012.7] - - [4, 4288, 1, 256] - - [528, 301.674] + - [634, 301.674] - - [256, 4288, 1, 128] - - [493, 2706.36] + - [599, 2706.36] - - [2048, 128, 1, 2048] - - [488, 2885.26] + - [594, 2885.26] - - [256, 128, 1, 3328] - - [490, 3170.21] + - [596, 3170.21] - - [512, 8, 1, 500000] - - [414, 1915.12] + - [520, 1915.12] - - [6784, 2368, 1, 256] - - [509, 8323.66] + - [615, 8323.66] - - [5888, 128, 1, 128] - - [497, 2466.08] + - [603, 2466.08] - - [1024, 24000, 1, 2816] - - [507, 8131.64] + - [613, 8131.64] - - [7680, 5984, 1, 2560] - - [511, 6040.77] + - [617, 6040.77] - - [4288, 1856, 1, 256] - - [523, 5818.53] + - [629, 5818.53] - - [1856, 256, 1, 3328] - - [504, 6532.03] + - [610, 6532.03] - - [1856, 2944, 1, 256] - - [509, 7312.92] + - [615, 7312.92] - - [5056, 1024, 1, 128] - - [499, 4103.0] + - [605, 4103.0] - - [64, 5888, 1, 1280] - - [504, 5058.25] + - [610, 5058.25] - - [1760, 800, 1, 1760] - - [507, 7280.0] + - [613, 7280.0] - - [6784, 256, 1, 128] - - [497, 3257.69] + - [603, 3257.69] - - [5888, 704, 1, 128] - - [493, 3813.93] + - [599, 3813.93] - - [1408, 2368, 1, 128] - - [494, 3561.27] + - [600, 3561.27] - - [1024, 4288, 1, 1280] - - [513, 7752.74] + - [619, 7752.74] - - [2368, 5056, 1, 3328] - - [516, 7711.91] + - [622, 7711.91] - - [448, 4, 1, 128] - - [524, 18.4795] + - [630, 18.4795] - - [4, 256, 1, 3328] - - [531, 269.71] + - [637, 269.71] - - [4288, 1024, 1, 3328] - - [510, 7910.27] + - [616, 7910.27] - - [6144, 48000, 1, 2560] - - [509, 3541.09] + - [615, 3541.09] - - [1024, 5056, 1, 3328] - - [503, 8509.66] + - [609, 8509.66] - - [1024, 1856, 1, 3328] - - [509, 7907.93] + - [615, 7907.93] - - [704, 704, 1, 1280] - - [521, 5648.15] + - [627, 5648.15] - - [128, 2368, 1, 1280] - - [480, 4145.11] + - [586, 4145.11] - - [1408, 128, 1, 3328] - - [453, 4919.6] + - [559, 4919.6] - - [3584, 256, 1, 1280] - - [505, 5185.56] + - [611, 5185.56] - - [4, 128, 1, 128] - - [524, 3.07891] + - [630, 3.07891] - - [5888, 64, 1, 1280] - - [453, 4499.59] + - [559, 4499.59] - - [3584, 128, 1, 1280] - - [510, 5929.01] + - [616, 5929.01] - - [4, 256, 1, 1280] - - [529, 170.767] + - [635, 170.767] - - [128, 704, 1, 3328] - - [453, 4379.37] + - [559, 4379.37] - - [4288, 6784, 1, 256] - - [503, 7181.09] + - [609, 7181.09] - - [3584, 2944, 1, 3328] - - [509, 8553.3] + - [615, 8553.3] - - [128, 1856, 1, 256] - - [510, 3207.77] + - [616, 3207.77] - - [64, 4288, 1, 256] - - [504, 2907.99] + - [610, 2907.99] - - [4, 3584, 1, 3328] - - [462, 560.605] + - [568, 560.605] - - [64, 4, 1, 3328] - - [531, 67.5025] + - [637, 67.5025] - - [4, 64, 1, 3328] - - [531, 88.8467] + - [637, 88.8467] - - [5888, 2944, 1, 256] - - [509, 7255.77] + - [615, 7255.77] - - [1856, 64, 1, 256] - - [455, 1743.72] + - [561, 1743.72] - - [5056, 128, 1, 1280] - - [510, 6009.79] + - [616, 6009.79] - - [448, 4288, 1, 1280] - - [511, 6466.82] + - [617, 6466.82] - - [448, 1856, 1, 3328] - - [511, 6381.99] + - [617, 6381.99] - - [1024, 4288, 1, 128] - - [496, 3491.87] + - [602, 3491.87] - - [4, 1024, 1, 256] - - [529, 172.563] + - [635, 172.563] - - [5056, 4288, 1, 256] - - [509, 8241.52] + - [615, 8241.52] - - [1024, 448, 1, 256] - - [513, 4218.51] + - [619, 4218.51] - - [1024, 3584, 1, 256] - - [509, 6513.69] + - [615, 6513.69] - - [2944, 128, 1, 1280] - - [453, 4710.48] + - [559, 4710.48] - - [2048, 32, 1, 2048] - - [468, 1779.23] + - [574, 1779.23] - - [64, 256, 1, 256] - - [462, 655.46] + - [568, 655.46] - - [1408, 4, 1, 128] - - [525, 20.1249] + - [631, 20.1249] - - [128, 2368, 1, 128] - - [425, 1707.73] + - [531, 1707.73] - - [256, 704, 1, 1280] - - [504, 3735.31] + - [610, 3735.31] - - [64, 2368, 1, 128] - - [432, 1049.81] + - [538, 1049.81] - - [6784, 6784, 1, 3328] - - [509, 9277.94] + - [615, 9277.94] - - [448, 5888, 1, 1280] - - [515, 7319.75] + - [621, 7319.75] - - [5056, 448, 1, 128] - - [497, 3694.43] + - [603, 3694.43] - - [4288, 704, 1, 1280] - - [507, 7890.96] + - [613, 7890.96] - - [3584, 2944, 1, 128] - - [499, 4124.71] + - [605, 4124.71] - - [6784, 256, 1, 1280] - - [515, 7185.83] + - [621, 7185.83] - - [256, 2944, 1, 1280] - - [504, 6736.76] + - [610, 6736.76] - - [64, 4288, 1, 128] - - [423, 1614.41] + - [529, 1614.41] - - [2368, 5888, 1, 3328] - - [505, 8616.46] + - [611, 8616.46] - - [4, 64, 1, 256] - - [442, 11.4778] + - [548, 11.4778] - - [704, 1024, 1, 3328] - - [510, 6801.92] + - [616, 6801.92] - - [2368, 1856, 1, 1280] - - [507, 7853.57] + - [613, 7853.57] - - [448, 5056, 1, 3328] - - [510, 7453.04] + - [616, 7453.04] - - [128, 448, 1, 128] - - [425, 530.449] + - [531, 530.449] - - [128, 6784, 1, 256] - - [505, 5557.55] + - [611, 5557.55] - - [3584, 4288, 1, 128] - - [496, 4462.73] + - [602, 4462.73] - - [64, 448, 1, 128] - - [425, 278.132] + - [531, 278.132] - - [5888, 4288, 1, 3328] - - [502, 9153.55] + - [608, 9153.55] - - [2368, 704, 1, 256] - - [509, 5350.78] + - [615, 5350.78] - - [256, 1856, 1, 3328] - - [504, 6536.35] + - [610, 6536.35] - - [1856, 128, 1, 256] - - [518, 2847.36] + - [624, 2847.36] - - [6784, 128, 1, 128] - - [498, 2530.82] + - [604, 2530.82] - - [3584, 1408, 1, 128] - - [499, 3625.62] + - [605, 3625.62] - - [1856, 5056, 1, 1280] - - [505, 8123.39] + - [611, 8123.39] - - [2944, 1024, 1, 1280] - - [515, 8450.41] + - [621, 8450.41] - - [5056, 4, 1, 256] - - [529, 380.787] + - [635, 380.787] - - [3584, 5888, 1, 3328] - - [507, 8567.99] + - [613, 8567.99] - - [2368, 4288, 1, 256] - - [511, 7858.07] + - [617, 7858.07] - - [1024, 2368, 1, 3328] - - [505, 6776.45] + - [611, 6776.45] - - [64, 704, 1, 3328] - - [460, 3503.52] + - [566, 3503.52] - - [704, 1408, 1, 256] - - [505, 6099.99] + - [611, 6099.99] - - [4096, 128, 1, 4096] - - [485, 4116.57] + - [591, 4116.57] - - [1024, 3584, 1, 1280] - - [515, 7231.65] + - [621, 7231.65] - - [4288, 5888, 1, 3328] - - [509, 8762.42] + - [615, 8762.42] - - [4288, 4, 1, 1280] - - [462, 492.797] + - [568, 492.797] - - [4608, 16, 1, 1536] - - [463, 1892.58] + - [569, 1892.58] - - [5888, 64, 1, 128] - - [440, 1747.73] + - [546, 1747.73] - - [4, 5888, 1, 128] - - [525, 84.5915] + - [631, 84.5915] - - [1024, 2944, 1, 3328] - - [513, 6907.05] + - [619, 6907.05] - - [6784, 1856, 1, 256] - - [509, 6274.07] + - [615, 6274.07] - - [2048, 64, 1, 2048] - - [492, 2371.44] + - [598, 2371.44] - - [256, 6784, 1, 1280] - - [509, 7067.04] + - [615, 7067.04] - - [1856, 3584, 1, 256] - - [515, 7706.87] + - [621, 7706.87] - - [128, 448, 1, 3328] - - [460, 3995.93] + - [566, 3995.93] - - [6784, 1856, 1, 128] - - [497, 4459.09] + - [603, 4459.09] - - [4, 448, 1, 256] - - [462, 84.4294] + - [568, 84.4294] - - [5056, 128, 1, 256] - - [510, 4954.5] + - [616, 4954.5] - - [512, 24000, 1, 2816] - - [503, 8994.98] + - [609, 8994.98] - - [256, 5888, 1, 1280] - - [502, 6184.0] + - [608, 6184.0] - - [4, 128, 1, 1280] - - [530, 71.9597] + - [636, 71.9597] - - [16384, 1600, 1, 4096] - - [509, 6921.09] + - [615, 6921.09] - - [6784, 128, 1, 1280] - - [513, 6486.37] + - [619, 6486.37] - - [64, 1408, 1, 256] - - [450, 1647.86] + - [556, 1647.86] - - [2368, 1408, 1, 128] - - [497, 3937.1] + - [603, 3937.1] - - [1856, 448, 1, 256] - - [510, 4635.57] + - [616, 4635.57] - - [1408, 1024, 1, 128] - - [493, 3208.51] + - [599, 3208.51] - - [128, 64, 1, 128] - - [422, 70.192] + - [528, 70.192] - - [6784, 3584, 1, 3328] - - [515, 8466.28] + - [621, 8466.28] - - [1760, 7000, 1, 1760] - - [513, 8149.21] + - [619, 8149.21] - - [2944, 64, 1, 3328] - - [446, 5018.09] + - [552, 5018.09] - - [64, 64, 1, 128] - - [422, 35.5249] + - [528, 35.5249] - - [2368, 5056, 1, 1280] - - [509, 8764.0] + - [615, 8764.0] - - [64, 4, 1, 1280] - - [531, 43.6745] + - [637, 43.6745] - - [1408, 2368, 1, 1280] - - [510, 7660.38] + - [616, 7660.38] - - [128, 1408, 1, 1280] - - [445, 4185.27] + - [551, 4185.27] - - [256, 64, 1, 3328] - - [470, 2071.75] + - [576, 2071.75] - - [704, 4288, 1, 128] - - [493, 4069.18] + - [599, 4069.18] - - [128, 1856, 1, 3328] - - [476, 5776.15] + - [582, 5776.15] - - [2944, 2944, 1, 256] - - [515, 7949.31] + - [621, 7949.31] - - [2944, 4, 1, 1280] - - [462, 483.218] + - [568, 483.218] - - [5888, 4, 1, 256] - - [447, 396.765] + - [553, 396.765] - - [6784, 256, 1, 256] - - [521, 4044.83] + - [627, 4044.83] - - [256, 5056, 1, 3328] - - [504, 7607.37] + - [610, 7607.37] - - [128, 4288, 1, 1280] - - [445, 4958.78] + - [551, 4958.78] - - [5056, 1856, 1, 128] - - [497, 4560.94] + - [603, 4560.94] - - [5056, 1024, 1, 3328] - - [509, 8634.18] + - [615, 8634.18] - - [128, 128, 1, 256] - - [447, 699.151] + - [553, 699.151] - - [1760, 64, 1, 1760] - - [453, 4580.65] + - [559, 4580.65] - - [4288, 3584, 1, 3328] - - [515, 9143.76] + - [621, 9143.76] - - [448, 704, 1, 3328] - - [504, 4473.43] + - [610, 4473.43] - - [448, 448, 1, 128] - - [435, 1264.38] + - [541, 1264.38] - - [1024, 2368, 1, 1280] - - [513, 7452.51] + - [619, 7452.51] - - [1856, 704, 1, 3328] - - [504, 6103.34] + - [610, 6103.34] - - [4, 2368, 1, 128] - - [524, 96.019] + - [630, 96.019] - - [5888, 6784, 1, 3328] - - [509, 9131.74] + - [615, 9131.74] - - [704, 4288, 1, 1280] - - [511, 7906.46] + - [617, 7906.46] - - [704, 256, 1, 256] - - [504, 2772.78] + - [610, 2772.78] - - [1024, 48000, 1, 2048] - - [508, 6513.45] + - [614, 6513.45] - - [4288, 1024, 1, 128] - - [493, 4291.77] - - - [512, 2048, 1, 49] - - [539, 4555.08] - - - [512, 128, 1, 784] - - [532, 3195.39] - - - [2048, 512, 1, 49] - - [540, 4253.43] - - - [1024, 256, 1, 196] - - [536, 4039.43] + - [599, 4291.77] - - [256, 64, 1, 3136] - - [534, 3015.37] + - [640, 3015.37] - - [256, 1024, 1, 196] - - [538, 4225.45] - - - [64, 256, 1, 3136] - - [535, 3058.45] - - - [128, 512, 1, 784] - - [533, 3380.38] - - - [64, 64, 1, 3136] - - [537, 1372.44] + - [644, 4225.45] - - [1024, 1024, 1, 3328] - - [650, 8705.1] + - [756, 8705.1] - - [2048, 200, 1, 3200] - - [655, 6173.42] + - [761, 6173.42] - - [1024, 200, 1, 13312] - - [553, 5213.31] + - [659, 5213.31] - - [1024, 256, 1, 1536] - - [655, 5859.43] + - [761, 5859.43] - - [4096, 256, 1, 12288] - - [660, 8807.52] + - [766, 8807.52] - - [64, 200, 1, 1024] - - [627, 366.632] + - [733, 366.632] - - [32, 512, 1, 1024] - - [582, 453.049] + - [688, 453.049] - - [2048, 256, 1, 3328] - - [644, 7876.73] + - [750, 7876.73] - - [4096, 512, 1, 32] - - [648, 3975.74] + - [754, 3975.74] - - [2048, 256, 1, 13312] - - [625, 7837.81] + - [731, 7837.81] - - [4096, 200, 1, 11264] - - [660, 6902.76] + - [766, 6902.76] - - [2048, 512, 1, 1024] - - [654, 8100.14] + - [760, 8100.14] - - [2048, 1024, 1, 1664] - - [554, 9082.08] + - [660, 9082.08] - - [1024, 1024, 1, 64] - - [650, 4258.28] + - [756, 4258.28] - - [512, 1024, 1, 1536] - - [644, 7597.33] + - [750, 7597.33] - - [1024, 256, 1, 15360] - - [545, 6735.24] + - [651, 6735.24] - - [1, 512, 1, 1024] - - [595, 15.1657] + - [701, 15.1657] - - [4096, 512, 1, 1408] - - [557, 9024.52] + - [663, 9024.52] - - [1024, 200, 1, 1408] - - [655, 4461.09] + - [761, 4461.09] - - [1024, 512, 1, 512] - - [649, 6528.2] + - [755, 6528.2] - - [4096, 256, 1, 15360] - - [656, 8824.03] + - [762, 8824.03] - - [2048, 512, 1, 640] - - [646, 7989.25] + - [752, 7989.25] - - [4096, 1024, 1, 1280] - - [552, 9421.54] + - [658, 9421.54] - - [1024, 200, 1, 6144] - - [644, 4966.52] + - [750, 4966.52] - - [1024, 1024, 1, 512] - - [646, 7731.54] + - [752, 7731.54] - - [128, 512, 1, 2048] - - [562, 2190.34] + - [668, 2190.34] - - [2048, 1024, 1, 640] - - [552, 8581.8] + - [658, 8581.8] - - [1024, 256, 1, 3328] - - [644, 6192.71] + - [750, 6192.71] - - [4096, 1024, 1, 13312] - - [557, 9642.59] + - [663, 9642.59] - - [2048, 256, 1, 2048] - - [644, 7485.75] + - [750, 7485.75] - - [2048, 1024, 1, 13312] - - [557, 9352.26] + - [663, 9352.26] - - [2048, 512, 1, 16640] - - [645, 8839.17] + - [751, 8839.17] - - [1024, 512, 1, 128] - - [649, 4280.0] + - [755, 4280.0] - - [2048, 1024, 1, 3584] - - [552, 9264.72] + - [658, 9264.72] - - [2048, 512, 1, 256] - - [660, 6990.61] + - [766, 6990.61] - - [512, 256, 1, 3200] - - [607, 4154.52] + - [713, 4154.52] - - [4096, 1024, 1, 1920] - - [552, 9535.32] + - [658, 9535.32] - - [4096, 200, 1, 2560] - - [657, 6754.65] + - [763, 6754.65] - - [1024, 256, 1, 16384] - - [547, 6289.6] + - [653, 6289.6] - - [1024, 1024, 1, 1152] - - [650, 8407.39] + - [756, 8407.39] - - [2048, 200, 1, 32] - - [593, 1412.51] + - [699, 1412.51] - - [512, 1024, 1, 2816] - - [644, 7843.25] + - [750, 7843.25] - - [4096, 256, 1, 14336] - - [656, 8844.77] + - [762, 8844.77] - - [1024, 200, 1, 4608] - - [655, 4931.74] + - [761, 4931.74] - - [1024, 200, 1, 16384] - - [550, 5135.15] + - [656, 5135.15] - - [64, 256, 1, 1024] - - [628, 461.013] + - [734, 461.013] - - [1, 200, 1, 1024] - - [610, 7.49884] + - [716, 7.49884] - - [2048, 200, 1, 2080] - - [655, 6033.87] + - [761, 6033.87] - - [512, 256, 1, 1792] - - [565, 3153.71] + - [671, 3153.71] - - [2048, 200, 1, 1024] - - [655, 5711.3] + - [761, 5711.3] - - [4096, 1024, 1, 12288] - - [552, 9658.23] + - [658, 9658.23] - - [4096, 200, 1, 4096] - - [646, 6834.55] + - [752, 6834.55] - - [1024, 512, 1, 11264] - - [613, 7686.46] + - [719, 7686.46] - - [128, 512, 1, 1024] - - [583, 1458.99] + - [689, 1458.99] - - [32, 256, 1, 2048] - - [601, 384.899] + - [707, 384.899] - - [1024, 200, 1, 1792] - - [655, 4638.64] + - [761, 4638.64] - - [1024, 1024, 1, 1792] - - [650, 8550.56] + - [756, 8550.56] - - [32, 256, 1, 512] - - [634, 161.419] + - [740, 161.419] - - [512, 200, 1, 2816] - - [560, 3353.1] + - [666, 3353.1] - - [512, 200, 1, 3072] - - [545, 3298.89] + - [651, 3298.89] - - [1024, 1024, 1, 8192] - - [591, 8369.1] + - [697, 8369.1] - - [1024, 256, 1, 12288] - - [548, 6475.71] + - [654, 6475.71] - - [4096, 200, 1, 768] - - [650, 6367.97] + - [756, 6367.97] - - [1024, 512, 1, 16384] - - [666, 7367.12] + - [772, 7367.12] - - [4096, 256, 1, 1024] - - [646, 8214.16] + - [752, 8214.16] - - [1024, 512, 1, 256] - - [649, 5537.13] + - [755, 5537.13] - - [4096, 1024, 1, 8320] - - [552, 9674.26] + - [658, 9674.26] - - [4096, 256, 1, 9216] - - [654, 8791.02] + - [760, 8791.02] - - [1024, 512, 1, 1408] - - [644, 7459.65] + - [750, 7459.65] - - [1024, 512, 1, 5632] - - [655, 7997.91] + - [761, 7997.91] - - [4096, 200, 1, 256] - - [660, 5371.9] + - [766, 5371.9] - - [1024, 200, 1, 128] - - [638, 1998.15] + - [744, 1998.15] - - [256, 200, 1, 1024] - - [607, 1196.01] + - [713, 1196.01] - - [1024, 200, 1, 5120] - - [655, 4957.44] + - [761, 4957.44] - - [512, 1024, 1, 3072] - - [668, 7104.07] + - [774, 7104.07] - - [4096, 1024, 1, 15360] - - [552, 9669.04] + - [658, 9669.04] - - [1, 256, 1, 2048] - - [594, 13.9262] + - [700, 13.9262] - - [1024, 1024, 1, 4160] - - [646, 8759.3] + - [752, 8759.3] - - [1024, 256, 1, 256] - - [653, 3728.37] + - [759, 3728.37] - - [2048, 256, 1, 384] - - [655, 6123.17] + - [761, 6123.17] - - [512, 256, 1, 2560] - - [609, 3809.64] + - [715, 3809.64] - - [4096, 512, 1, 3072] - - [557, 9215.19] + - [663, 9215.19] - - [1024, 256, 1, 4160] - - [644, 6293.49] + - [750, 6293.49] - - [4096, 512, 1, 13312] - - [554, 9367.32] + - [660, 9367.32] - - [4096, 1024, 1, 3840] - - [552, 9631.57] + - [658, 9631.57] - - [4096, 200, 1, 640] - - [650, 6206.16] + - [756, 6206.16] - - [32, 200, 1, 2048] - - [588, 303.507] + - [694, 303.507] - - [1024, 200, 1, 512] - - [644, 3713.19] + - [750, 3713.19] - - [1024, 1024, 1, 7168] - - [647, 8475.74] + - [753, 8475.74] - - [2048, 1024, 1, 3200] - - [552, 9271.34] + - [658, 9271.34] - - [512, 512, 1, 1536] - - [655, 5832.27] + - [761, 5832.27] - - [4096, 256, 1, 768] - - [660, 8066.07] + - [766, 8066.07] - - [2048, 256, 1, 6656] - - [644, 8034.87] + - [750, 8034.87] - - [1024, 256, 1, 896] - - [644, 5467.54] + - [750, 5467.54] - - [2048, 256, 1, 512] - - [655, 6465.31] + - [761, 6465.31] - - [2048, 200, 1, 3072] - - [655, 6165.78] + - [761, 6165.78] - - [128, 200, 1, 1024] - - [612, 692.87] + - [718, 692.87] - - [4096, 512, 1, 3840] - - [557, 9272.7] + - [663, 9272.7] - - [1024, 200, 1, 3200] - - [655, 4838.85] + - [761, 4838.85] - - [4096, 512, 1, 5632] - - [552, 9335.52] + - [658, 9335.52] - - [4096, 512, 1, 64] - - [587, 5275.95] + - [693, 5275.95] - - [1024, 512, 1, 2816] - - [644, 7816.68] + - [750, 7816.68] - - [4096, 256, 1, 7680] - - [650, 8795.5] + - [756, 8795.5] - - [4096, 200, 1, 1024] - - [660, 6448.91] + - [766, 6448.91] - - [1024, 512, 1, 12288] - - [614, 7624.67] + - [720, 7624.67] - - [2048, 1024, 1, 512] - - [557, 8436.16] + - [663, 8436.16] - - [128, 256, 1, 2048] - - [631, 1342.28] + - [737, 1342.28] - - [2048, 200, 1, 1792] - - [655, 6020.47] + - [761, 6020.47] - - [1024, 1024, 1, 2816] - - [646, 8670.5] + - [752, 8670.5] - - [2048, 512, 1, 1536] - - [657, 8466.32] + - [763, 8466.32] - - [4096, 256, 1, 3072] - - [654, 8631.47] + - [760, 8631.47] - - [1024, 200, 1, 1536] - - [636, 4577.7] + - [742, 4577.7] - - [1024, 256, 1, 1024] - - [644, 5491.82] + - [750, 5491.82] - - [4096, 512, 1, 8192] - - [557, 9325.64] + - [663, 9325.64] - - [128, 1024, 1, 512] - - [655, 2534.42] + - [761, 2534.42] - - [4096, 512, 1, 2304] - - [552, 9193.09] + - [658, 9193.09] - - [2048, 256, 1, 5632] - - [655, 7999.64] + - [761, 7999.64] - - [1024, 256, 1, 5120] - - [655, 6307.32] + - [761, 6307.32] - - [1024, 512, 1, 6656] - - [655, 8028.95] + - [761, 8028.95] - - [4096, 512, 1, 2816] - - [552, 9234.5] + - [658, 9234.5] - - [4096, 200, 1, 2080] - - [639, 6697.96] + - [745, 6697.96] - - [1024, 200, 1, 2304] - - [655, 4752.91] + - [761, 4752.91] - - [2048, 200, 1, 13312] - - [644, 6346.23] + - [750, 6346.23] - - [64, 1024, 1, 1024] - - [628, 1359.68] + - [734, 1359.68] - - [4096, 256, 1, 3584] - - [650, 8668.9] + - [756, 8668.9] - - [2048, 1024, 1, 7680] - - [552, 9365.88] + - [658, 9365.88] - - [1024, 256, 1, 1664] - - [644, 5907.57] + - [750, 5907.57] - - [1, 512, 1, 2048] - - [571, 23.5057] + - [677, 23.5057] - - [512, 512, 1, 1024] - - [644, 5360.23] + - [750, 5360.23] - - [2048, 256, 1, 8192] - - [616, 7665.31] + - [722, 7665.31] - - [2048, 512, 1, 512] - - [646, 7767.33] + - [752, 7767.33] - - [4096, 512, 1, 1920] - - [552, 9133.04] + - [658, 9133.04] - - [4096, 200, 1, 12288] - - [660, 6910.75] + - [766, 6910.75] - - [1024, 512, 1, 3072] - - [590, 7310.43] + - [696, 7310.43] - - [2048, 512, 1, 1152] - - [650, 8342.36] + - [756, 8342.36] - - [1024, 256, 1, 2080] - - [644, 6010.46] + - [750, 6010.46] - - [4096, 1024, 1, 32] - - [640, 4793.59] + - [746, 4793.59] - - [4096, 512, 1, 16640] - - [552, 9365.41] + - [658, 9365.41] - - [2048, 200, 1, 9216] - - [644, 6315.98] + - [750, 6315.98] - - [2048, 200, 1, 2560] - - [644, 6119.24] + - [750, 6119.24] - - [2048, 1024, 1, 1024] - - [552, 8628.69] + - [658, 8628.69] - - [2048, 256, 1, 4608] - - [644, 7951.39] + - [750, 7951.39] - - [512, 200, 1, 768] - - [596, 2132.51] + - [702, 2132.51] - - [128, 256, 1, 512] - - [596, 670.117] + - [702, 670.117] - - [4096, 512, 1, 1792] - - [557, 9127.01] + - [663, 9127.01] - - [4096, 1024, 1, 8192] - - [552, 9591.37] + - [658, 9591.37] - - [1024, 256, 1, 2816] - - [655, 6119.11] + - [761, 6119.11] - - [1024, 1024, 1, 13312] - - [647, 8529.37] + - [753, 8529.37] - - [2048, 1024, 1, 4160] - - [552, 9305.67] + - [658, 9305.67] - - [2048, 256, 1, 3584] - - [644, 7903.23] + - [750, 7903.23] - - [128, 200, 1, 2048] - - [612, 1135.91] + - [718, 1135.91] - - [4096, 512, 1, 10240] - - [554, 9339.59] + - [660, 9339.59] - - [4096, 512, 1, 512] - - [552, 8446.78] + - [658, 8446.78] - - [2048, 1024, 1, 6656] - - [552, 9331.75] + - [658, 9331.75] - - [1024, 512, 1, 640] - - [644, 6776.04] + - [750, 6776.04] - - [2048, 512, 1, 768] - - [646, 8085.51] + - [752, 8085.51] - - [2048, 200, 1, 1408] - - [644, 5880.17] + - [750, 5880.17] - - [4096, 200, 1, 2048] - - [660, 6691.71] + - [766, 6691.71] - - [1024, 1024, 1, 5632] - - [646, 8749.63] + - [752, 8749.63] - - [2048, 512, 1, 3584] - - [650, 8704.23] + - [756, 8704.23] - - [64, 512, 1, 512] - - [586, 667.983] + - [692, 667.983] - - [64, 200, 1, 512] - - [596, 251.388] + - [702, 251.388] - - [1024, 200, 1, 64] - - [551, 1310.82] + - [657, 1310.82] - - [512, 512, 1, 2304] - - [644, 6078.8] + - [750, 6078.8] - - [2048, 1024, 1, 14336] - - [552, 9321.94] + - [658, 9321.94] - - [4096, 512, 1, 11264] - - [554, 9339.95] + - [660, 9339.95] - - [4096, 512, 1, 128] - - [639, 6566.53] + - [745, 6566.53] - - [1024, 512, 1, 64] - - [659, 2953.84] + - [765, 2953.84] - - [4096, 512, 1, 768] - - [552, 8738.23] + - [658, 8738.23] - - [4096, 1024, 1, 11264] - - [552, 9637.78] + - [658, 9637.78] - - [1, 256, 1, 1024] - - [642, 8.93234] + - [748, 8.93234] - - [4096, 200, 1, 7680] - - [639, 6889.57] + - [745, 6889.57] - - [1024, 200, 1, 12288] - - [611, 5237.74] + - [717, 5237.74] - - [1024, 1024, 1, 1280] - - [646, 8418.17] + - [752, 8418.17] - - [4096, 1024, 1, 16640] - - [552, 9675.01] + - [658, 9675.01] - - [2048, 1024, 1, 5632] - - [552, 9327.85] + - [658, 9327.85] - - [1024, 200, 1, 15360] - - [611, 5386.63] + - [717, 5386.63] - - [1, 1024, 1, 1024] - - [661, 27.3499] + - [767, 27.3499] - - [2048, 256, 1, 16384] - - [622, 7652.75] + - [728, 7652.75] - - [4096, 512, 1, 12288] - - [554, 9359.51] + - [660, 9359.51] - - [2048, 200, 1, 896] - - [655, 5628.96] + - [761, 5628.96] - - [4096, 1024, 1, 5632] - - [552, 9626.78] + - [658, 9626.78] - - [2048, 256, 1, 32] - - [648, 1889.43] + - [754, 1889.43] - - [2048, 256, 1, 1280] - - [644, 7390.94] + - [750, 7390.94] - - [4096, 256, 1, 4096] - - [646, 8694.37] + - [752, 8694.37] - - [2048, 256, 1, 11264] - - [644, 8113.95] + - [750, 8113.95] - - [4096, 200, 1, 9216] - - [646, 6891.08] + - [752, 6891.08] - - [1024, 512, 1, 4096] - - [592, 7348.46] + - [698, 7348.46] - - [2048, 1024, 1, 10240] - - [554, 9095.91] + - [660, 9095.91] - - [4096, 1024, 1, 640] - - [552, 9115.68] + - [658, 9115.68] - - [128, 1024, 1, 2048] - - [545, 3270.51] + - [651, 3270.51] - - [4096, 200, 1, 3840] - - [639, 6836.26] + - [745, 6836.26] - - [1024, 1024, 1, 1920] - - [650, 8562.82] + - [756, 8562.82] - - [2048, 200, 1, 7168] - - [655, 6296.23] + - [761, 6296.23] - - [2048, 512, 1, 16384] - - [546, 8632.51] + - [652, 8632.51] - - [2048, 1024, 1, 12288] - - [552, 9158.08] + - [658, 9158.08] - - [4096, 1024, 1, 10240] - - [552, 9658.84] + - [658, 9658.84] - - [1024, 1024, 1, 8320] - - [654, 8799.58] + - [760, 8799.58] - - [1024, 256, 1, 9216] - - [644, 6375.23] + - [750, 6375.23] - - [4096, 256, 1, 1152] - - [639, 8301.09] + - [745, 8301.09] - - [512, 200, 1, 2560] - - [605, 3088.51] + - [711, 3088.51] - - [2048, 256, 1, 1920] - - [644, 7714.94] + - [750, 7714.94] - - [2048, 1024, 1, 4608] - - [552, 9305.7] + - [658, 9305.7] - - [512, 256, 1, 1024] - - [652, 2887.74] + - [758, 2887.74] - - [1024, 256, 1, 1920] - - [636, 5913.12] + - [742, 5913.12] - - [4096, 512, 1, 3584] - - [552, 9275.69] + - [658, 9275.69] - - [2048, 512, 1, 4160] - - [657, 8734.03] + - [763, 8734.03] - - [2048, 512, 1, 5632] - - [660, 8758.98] + - [766, 8758.98] - - [4096, 1024, 1, 4608] - - [552, 9657.22] + - [658, 9657.22] - - [4096, 1024, 1, 3328] - - [552, 9621.45] + - [658, 9621.45] - - [4096, 256, 1, 7168] - - [646, 8770.05] + - [752, 8770.05] - - [4096, 200, 1, 128] - - [660, 4458.33] + - [766, 4458.33] - - [2048, 200, 1, 5120] - - [644, 6176.91] + - [750, 6176.91] - - [1024, 1024, 1, 6656] - - [646, 8780.45] + - [752, 8780.45] - - [512, 1024, 1, 3200] - - [655, 7887.09] + - [761, 7887.09] - - [512, 200, 1, 2304] - - [545, 2991.09] + - [651, 2991.09] - - [2048, 1024, 1, 9216] - - [557, 9325.46] + - [663, 9325.46] - - [2048, 256, 1, 1536] - - [655, 7551.73] + - [761, 7551.73] - - [4096, 256, 1, 256] - - [660, 6932.83] + - [766, 6932.83] - - [2048, 512, 1, 1408] - - [657, 8430.86] + - [763, 8430.86] - - [1024, 256, 1, 384] - - [649, 4462.13] + - [755, 4462.13] - - [2048, 1024, 1, 2304] - - [552, 9174.94] + - [658, 9174.94] - - [4096, 512, 1, 6144] - - [554, 9284.25] + - [660, 9284.25] - - [1024, 200, 1, 14336] - - [543, 5268.57] + - [649, 5268.57] - - [1024, 512, 1, 2080] - - [655, 7736.47] + - [761, 7736.47] - - [2048, 512, 1, 2304] - - [657, 8616.07] + - [763, 8616.07] - - [4096, 512, 1, 15360] - - [557, 9362.17] + - [663, 9362.17] - - [1024, 256, 1, 32] - - [577, 1028.12] + - [683, 1028.12] - - [1024, 200, 1, 2816] - - [655, 4780.58] + - [761, 4780.58] - - [4096, 200, 1, 512] - - [646, 6054.23] + - [752, 6054.23] - - [4096, 1024, 1, 7168] - - [557, 9468.49] + - [663, 9468.49] - - [2048, 256, 1, 14336] - - [618, 7865.52] + - [724, 7865.52] - - [1024, 200, 1, 3072] - - [655, 4804.2] + - [761, 4804.2] - - [2048, 200, 1, 1280] - - [655, 5846.31] + - [761, 5846.31] - - [1024, 1024, 1, 2304] - - [646, 8633.32] + - [752, 8633.32] - - [4096, 1024, 1, 9216] - - [552, 9641.03] + - [658, 9641.03] - - [2048, 512, 1, 4608] - - [657, 8743.3] + - [763, 8743.3] - - [4096, 1024, 1, 7680] - - [552, 9684.86] + - [658, 9684.86] - - [4096, 256, 1, 6144] - - [657, 8757.24] + - [763, 8757.24] - - [4096, 256, 1, 896] - - [650, 8258.93] + - [756, 8258.93] - - [512, 256, 1, 1536] - - [634, 3065.36] + - [740, 3065.36] - - [1024, 256, 1, 512] - - [644, 4752.85] + - [750, 4752.85] - - [2048, 256, 1, 640] - - [644, 6776.04] + - [750, 6776.04] - - [256, 256, 1, 2048] - - [581, 2249.06] + - [687, 2249.06] - - [2048, 1024, 1, 8192] - - [552, 9178.17] + - [658, 9178.17] - - [4096, 200, 1, 16640] - - [544, 7009.59] + - [650, 7009.59] - - [256, 512, 1, 512] - - [556, 2511.66] + - [662, 2511.66] - - [2048, 512, 1, 384] - - [657, 7467.7] + - [763, 7467.7] - - [2048, 200, 1, 16384] - - [625, 6327.31] + - [731, 6327.31] - - [4096, 200, 1, 10240] - - [650, 6892.74] + - [756, 6892.74] - - [1024, 512, 1, 9216] - - [599, 7530.09] + - [705, 7530.09] - - [4096, 1024, 1, 64] - - [574, 6260.26] + - [680, 6260.26] - - [4096, 200, 1, 1920] - - [660, 6710.27] + - [766, 6710.27] - - [2048, 1024, 1, 1280] - - [552, 8998.34] + - [658, 8998.34] - - [1024, 200, 1, 3840] - - [644, 4873.87] + - [750, 4873.87] - - [256, 1024, 1, 512] - - [655, 4766.35] + - [761, 4766.35] - - [2048, 1024, 1, 3328] - - [552, 9275.2] + - [658, 9275.2] - - [1024, 256, 1, 16640] - - [609, 6837.22] + - [715, 6837.22] - - [4096, 512, 1, 14336] - - [557, 9354.42] + - [663, 9354.42] - - [1024, 1024, 1, 16640] - - [654, 8832.37] + - [760, 8832.37] - - [1024, 256, 1, 1152] - - [655, 5642.66] + - [761, 5642.66] - - [512, 512, 1, 512] - - [644, 4779.93] + - [750, 4779.93] - - [4096, 512, 1, 8320] - - [557, 9327.96] + - [663, 9327.96] - - [2048, 512, 1, 7680] - - [660, 8793.96] + - [766, 8793.96] - - [4096, 1024, 1, 6656] - - [552, 9667.03] + - [658, 9667.03] - - [1024, 512, 1, 3584] - - [655, 7900.57] + - [761, 7900.57] - - [1024, 1024, 1, 32] - - [640, 2974.78] + - [746, 2974.78] - - [512, 512, 1, 2816] - - [636, 6155.85] + - [742, 6155.85] - - [2048, 512, 1, 1664] - - [660, 8496.55] + - [766, 8496.55] - - [1024, 1024, 1, 14336] - - [546, 8624.74] + - [652, 8624.74] - - [2048, 200, 1, 2048] - - [655, 6029.86] + - [761, 6029.86] - - [1024, 1024, 1, 3584] - - [646, 8702.62] + - [752, 8702.62] - - [512, 200, 1, 1280] - - [560, 2350.75] + - [666, 2350.75] - - [4096, 256, 1, 6656] - - [660, 8788.41] + - [766, 8788.41] - - [4096, 256, 1, 4160] - - [637, 8728.44] + - [743, 8728.44] - - [128, 256, 1, 1024] - - [619, 859.589] + - [725, 859.589] - - [512, 200, 1, 3200] - - [560, 3376.85] + - [666, 3376.85] - - [2048, 512, 1, 9216] - - [643, 8806.4] + - [749, 8806.4] - - [2048, 1024, 1, 256] - - [639, 7713.76] + - [745, 7713.76] - - [1024, 256, 1, 2304] - - [655, 6015.83] + - [761, 6015.83] - - [1024, 200, 1, 8192] - - [655, 5022.02] + - [761, 5022.02] - - [2048, 256, 1, 3072] - - [572, 7515.09] + - [678, 7515.09] - - [2048, 256, 1, 8320] - - [644, 8063.68] + - [750, 8063.68] - - [4096, 512, 1, 1024] - - [554, 8824.41] + - [660, 8824.41] - - [1024, 512, 1, 3200] - - [644, 7866.39] + - [750, 7866.39] - - [1024, 512, 1, 896] - - [636, 7161.11] + - [742, 7161.11] - - [2048, 512, 1, 1280] - - [650, 8384.52] + - [756, 8384.52] - - [4096, 200, 1, 64] - - [559, 3260.6] + - [665, 3260.6] - - [1024, 256, 1, 6144] - - [665, 6143.72] + - [771, 6143.72] - - [1024, 200, 1, 2560] - - [644, 4762.89] + - [750, 4762.89] - - [1024, 1024, 1, 5120] - - [573, 8454.23] + - [679, 8454.23] - - [2048, 512, 1, 6656] - - [650, 8799.05] + - [756, 8799.05] - - [4096, 1024, 1, 1536] - - [552, 9503.37] + - [658, 9503.37] - - [1024, 1024, 1, 128] - - [575, 5825.52] + - [681, 5825.52] - - [512, 1024, 1, 1792] - - [644, 7701.12] + - [750, 7701.12] - - [2048, 1024, 1, 32] - - [555, 3938.41] + - [661, 3938.41] - - [4096, 256, 1, 2816] - - [639, 8652.2] + - [745, 8652.2] - - [1024, 1024, 1, 15360] - - [546, 8719.7] + - [652, 8719.7] - - [1024, 256, 1, 5632] - - [644, 6344.18] + - [750, 6344.18] - - [1024, 1024, 1, 4096] - - [647, 8187.86] + - [753, 8187.86] - - [2048, 200, 1, 4160] - - [655, 6222.48] + - [761, 6222.48] - - [512, 256, 1, 768] - - [586, 2771.67] + - [692, 2771.67] - - [4096, 512, 1, 640] - - [557, 8590.58] + - [663, 8590.58] - - [2048, 512, 1, 8192] - - [599, 8494.9] + - [705, 8494.9] - - [1024, 512, 1, 768] - - [644, 7049.35] + - [750, 7049.35] - - [4096, 200, 1, 8320] - - [639, 6908.7] + - [745, 6908.7] - - [2048, 512, 1, 896] - - [646, 8224.23] + - [752, 8224.23] - - [4096, 200, 1, 7168] - - [657, 6878.59] + - [763, 6878.59] - - [2048, 512, 1, 13312] - - [645, 8803.04] + - [751, 8803.04] - - [64, 512, 1, 1024] - - [549, 844.024] + - [655, 844.024] - - [2048, 200, 1, 3840] - - [644, 6192.48] + - [750, 6192.48] - - [1024, 1024, 1, 768] - - [637, 8098.51] + - [743, 8098.51] - - [4096, 512, 1, 16384] - - [557, 9345.73] + - [663, 9345.73] - - [4096, 256, 1, 2304] - - [637, 8596.45] + - [743, 8596.45] - - [1, 256, 1, 4096] - - [642, 19.9293] + - [748, 19.9293] - - [1024, 1024, 1, 11264] - - [647, 8491.48] + - [753, 8491.48] - - [2048, 200, 1, 16640] - - [641, 6510.64] + - [747, 6510.64] - - [1024, 256, 1, 3072] - - [655, 6179.55] + - [761, 6179.55] - - [4096, 1024, 1, 512] - - [552, 9032.25] + - [658, 9032.25] - - [2048, 256, 1, 2816] - - [644, 7793.57] + - [750, 7793.57] - - [32, 512, 1, 512] - - [556, 318.816] + - [662, 318.816] - - [256, 512, 1, 2048] - - [607, 3369.02] + - [713, 3369.02] - - [1024, 512, 1, 384] - - [655, 6198.58] + - [761, 6198.58] - - [2048, 200, 1, 7680] - - [644, 6307.7] + - [750, 6307.7] - - [1024, 512, 1, 4608] - - [655, 7953.48] + - [761, 7953.48] - - [4096, 200, 1, 32] - - [604, 2199.29] + - [710, 2199.29] - - [4096, 200, 1, 3328] - - [639, 6813.12] + - [745, 6813.12] - - [1024, 200, 1, 1152] - - [644, 4375.65] + - [750, 4375.65] - - [1024, 1024, 1, 1408] - - [646, 8457.91] + - [752, 8457.91] - - [2048, 200, 1, 15360] - - [620, 6333.1] + - [726, 6333.1] - - [512, 1024, 1, 2048] - - [630, 6280.76] + - [736, 6280.76] - - [1024, 512, 1, 1024] - - [655, 7064.19] + - [761, 7064.19] - - [1024, 200, 1, 10240] - - [644, 5030.69] + - [750, 5030.69] - - [4096, 256, 1, 5632] - - [657, 8765.22] + - [763, 8765.22] - - [512, 512, 1, 3072] - - [667, 5942.44] + - [773, 5942.44] - - [2048, 256, 1, 1408] - - [644, 7545.05] + - [750, 7545.05] - - [2048, 256, 1, 6144] - - [655, 7963.97] + - [761, 7963.97] - - [4096, 256, 1, 3328] - - [650, 8682.58] + - [756, 8682.58] - - [1024, 200, 1, 1664] - - [644, 4595.4] + - [750, 4595.4] - - [2048, 1024, 1, 1152] - - [552, 8942.65] + - [658, 8942.65] - - [2048, 512, 1, 6144] - - [645, 8729.71] + - [751, 8729.71] - - [2048, 512, 1, 3200] - - [646, 8696.56] + - [752, 8696.56] - - [4096, 1024, 1, 2080] - - [585, 9538.45] + - [691, 9538.45] - - [4096, 1024, 1, 768] - - [552, 9260.75] + - [658, 9260.75] - - [4096, 1024, 1, 2560] - - [552, 9567.27] + - [658, 9567.27] - - [64, 200, 1, 2048] - - [584, 583.161] + - [690, 583.161] - - [2048, 200, 1, 4608] - - [655, 6243.28] + - [761, 6243.28] - - [1024, 1024, 1, 6144] - - [647, 8320.25] + - [753, 8320.25] - - [4096, 256, 1, 1664] - - [650, 8503.17] + - [756, 8503.17] - - [2048, 200, 1, 384] - - [655, 4940.0] + - [761, 4940.0] - - [1, 200, 1, 2048] - - [601, 11.3281] + - [707, 11.3281] - - [4096, 256, 1, 1792] - - [660, 8504.12] + - [766, 8504.12] - - [2048, 1024, 1, 64] - - [574, 5309.35] + - [680, 5309.35] - - [4096, 1024, 1, 16384] - - [541, 9428.61] + - [647, 9428.61] - - [1024, 512, 1, 16640] - - [655, 8122.55] + - [761, 8122.55] - - [2048, 512, 1, 10240] - - [645, 8766.21] + - [751, 8766.21] - - [4096, 512, 1, 6656] - - [552, 9351.75] + - [658, 9351.75] - - [2048, 256, 1, 16640] - - [644, 8135.27] + - [750, 8135.27] - - [2048, 512, 1, 2816] - - [646, 8660.32] + - [752, 8660.32] - - [1024, 200, 1, 32] - - [564, 780.291] + - [670, 780.291] - - [1, 512, 1, 4096] - - [589, 34.8671] + - [695, 34.8671] - - [256, 256, 1, 1024] - - [596, 1490.08] + - [702, 1490.08] - - [2048, 1024, 1, 128] - - [569, 6605.3] + - [675, 6605.3] - - [2048, 1024, 1, 2080] - - [552, 9159.51] + - [658, 9159.51] - - [2048, 1024, 1, 16640] - - [552, 9371.65] + - [658, 9371.65] - - [1024, 200, 1, 384] - - [655, 3378.24] + - [761, 3378.24] - - [4096, 256, 1, 384] - - [600, 7369.3] + - [706, 7369.3] - - [4096, 256, 1, 13312] - - [654, 8776.48] + - [760, 8776.48] - - [2048, 256, 1, 128] - - [649, 4280.0] + - [755, 4280.0] - - [512, 256, 1, 2304] - - [561, 3584.98] + - [667, 3584.98] - - [2048, 1024, 1, 3072] - - [554, 9156.52] + - [660, 9156.52] - - [1024, 1024, 1, 640] - - [650, 7928.84] + - [756, 7928.84] - - [256, 512, 1, 1024] - - [655, 2843.7] + - [761, 2843.7] - - [4096, 1024, 1, 1408] - - [552, 9437.56] + - [658, 9437.56] - - [4096, 200, 1, 5632] - - [657, 6873.96] + - [763, 6873.96] - - [4096, 1024, 1, 2048] - - [552, 9437.1] + - [658, 9437.1] - - [2048, 1024, 1, 2560] - - [557, 9195.62] + - [663, 9195.62] - - [4096, 1024, 1, 128] - - [639, 7407.26] + - [745, 7407.26] - - [1024, 200, 1, 3328] - - [655, 4857.39] + - [761, 4857.39] - - [2048, 200, 1, 1152] - - [644, 5760.1] + - [750, 5760.1] - - [1024, 200, 1, 9216] - - [543, 5053.21] + - [649, 5053.21] - - [4096, 256, 1, 512] - - [637, 7617.45] + - [743, 7617.45] - - [4096, 1024, 1, 14336] - - [552, 9665.12] + - [658, 9665.12] - - [1024, 1024, 1, 384] - - [575, 7478.8] + - [681, 7478.8] - - [2048, 200, 1, 512] - - [644, 5150.28] + - [750, 5150.28] - - [2048, 256, 1, 9216] - - [623, 7717.71] + - [729, 7717.71] - - [2048, 256, 1, 1792] - - [644, 7655.94] + - [750, 7655.94] - - [4096, 512, 1, 9216] - - [554, 9331.22] + - [660, 9331.22] - - [4096, 200, 1, 15360] - - [544, 6958.14] + - [650, 6958.14] - - [1024, 512, 1, 2048] - - [643, 7067.91] + - [749, 7067.91] - - [64, 256, 1, 2048] - - [568, 723.256] + - [674, 723.256] - - [4096, 200, 1, 1792] - - [646, 6699.65] + - [752, 6699.65] - - [1, 200, 1, 4096] - - [578, 15.6387] + - [684, 15.6387] - - [2048, 1024, 1, 2048] - - [557, 9071.93] + - [663, 9071.93] - - [1024, 200, 1, 2080] - - [636, 4679.19] + - [742, 4679.19] - - [2048, 200, 1, 1536] - - [655, 5939.92] + - [761, 5939.92] - - [1024, 1024, 1, 3072] - - [617, 8333.15] + - [723, 8333.15] - - [512, 200, 1, 1792] - - [542, 2679.73] + - [648, 2679.73] - - [1024, 256, 1, 11264] - - [545, 6470.98] + - [651, 6470.98] - - [2048, 512, 1, 12288] - - [592, 8729.24] + - [698, 8729.24] - - [1024, 256, 1, 1792] - - [655, 5931.44] + - [761, 5931.44] - - [1024, 200, 1, 7168] - - [655, 4970.33] + - [761, 4970.33] - - [32, 256, 1, 1024] - - [566, 237.334] + - [672, 237.334] - - [512, 256, 1, 3072] - - [609, 3813.1] + - [715, 3813.1] - - [1024, 1024, 1, 2080] - - [646, 8600.41] + - [752, 8600.41] - - [2048, 200, 1, 2304] - - [655, 6093.32] + - [761, 6093.32] - - [4096, 512, 1, 1536] - - [552, 9075.0] + - [658, 9075.0] - - [2048, 256, 1, 7168] - - [655, 7895.26] + - [761, 7895.26] - - [2048, 512, 1, 1792] - - [657, 8531.92] + - [763, 8531.92] - - [1024, 200, 1, 2048] - - [644, 4685.43] + - [750, 4685.43] - - [1024, 1024, 1, 4608] - - [650, 8735.71] + - [756, 8735.71] - - [4096, 256, 1, 8192] - - [646, 8782.55] + - [752, 8782.55] - - [512, 1024, 1, 1280] - - [636, 7483.25] + - [742, 7483.25] - - [2048, 1024, 1, 16384] - - [546, 8878.96] + - [652, 8878.96] - - [512, 512, 1, 1280] - - [644, 5745.72] + - [750, 5745.72] - - [1024, 200, 1, 1280] - - [636, 4446.23] - - - [4096, 512, 1, 4096] - - [554, 9264.49] + - [742, 4446.23] - - [2048, 256, 1, 3200] - - [644, 7842.85] + - [750, 7842.85] - - [2048, 512, 1, 15360] - - [592, 8757.24] + - [698, 8757.24] - - [1024, 512, 1, 3328] - - [644, 7854.04] + - [750, 7854.04] - - [1024, 512, 1, 4160] - - [644, 7934.61] + - [750, 7934.61] - - [4096, 200, 1, 6656] - - [646, 6883.3] + - [752, 6883.3] - - [4096, 1024, 1, 1024] - - [552, 9229.44] + - [658, 9229.44] - - [2048, 200, 1, 3328] - - [655, 6182.74] + - [761, 6182.74] - - [1024, 1024, 1, 256] - - [575, 6932.83] + - [681, 6932.83] - - [512, 200, 1, 512] - - [596, 1910.77] + - [702, 1910.77] - - [2048, 256, 1, 64] - - [567, 2912.81] + - [673, 2912.81] - - [1024, 256, 1, 2560] - - [644, 6123.17] + - [750, 6123.17] - - [2048, 512, 1, 11264] - - [656, 8728.94] + - [762, 8728.94] - - [32, 200, 1, 1024] - - [651, 187.56] + - [757, 187.56] - - [32, 512, 1, 2048] - - [595, 694.521] + - [701, 694.521] - - [2048, 256, 1, 2304] - - [644, 7759.35] + - [750, 7759.35] - - [2048, 256, 1, 12288] - - [623, 7726.35] + - [729, 7726.35] - - [4096, 200, 1, 8192] - - [646, 6870.94] + - [752, 6870.94] - - [1024, 512, 1, 7168] - - [592, 7479.2] + - [698, 7479.2] - - [1024, 512, 1, 1792] - - [644, 7626.11] + - [750, 7626.11] - - [4096, 1024, 1, 1664] - - [552, 9503.54] + - [658, 9503.54] - - [4096, 200, 1, 2816] - - [639, 6775.44] + - [745, 6775.44] - - [1024, 1024, 1, 896] - - [646, 8229.99] + - [752, 8229.99] - - [1024, 200, 1, 8320] - - [607, 5173.58] + - [713, 5173.58] - - [1024, 1024, 1, 12288] - - [647, 8463.21] + - [753, 8463.21] - - [1024, 256, 1, 8320] - - [636, 6404.37] + - [742, 6404.37] - - [1024, 200, 1, 1024] - - [644, 4297.54] + - [750, 4297.54] - - [1024, 200, 1, 16640] - - [606, 5499.51] + - [712, 5499.51] - - [4096, 256, 1, 5120] - - [660, 8729.15] + - [766, 8729.15] - - [1024, 256, 1, 3200] - - [655, 6124.96] + - [761, 6124.96] - - [512, 512, 1, 2560] - - [655, 6109.79] + - [761, 6109.79] - - [4096, 256, 1, 2048] - - [660, 8511.05] + - [766, 8511.05] - - [1024, 256, 1, 640] - - [644, 5102.66] + - [750, 5102.66] - - [2048, 256, 1, 5120] - - [572, 7667.93] + - [678, 7667.93] - - [2048, 256, 1, 7680] - - [655, 8054.45] + - [761, 8054.45] - - [4096, 512, 1, 384] - - [650, 8190.77] + - [756, 8190.77] - - [2048, 200, 1, 3584] - - [644, 6166.12] + - [750, 6166.12] - - [1024, 512, 1, 1536] - - [644, 7517.9] + - [750, 7517.9] - - [4096, 512, 1, 3328] - - [552, 9259.45] + - [658, 9259.45] - - [4096, 1024, 1, 256] - - [552, 8341.79] + - [658, 8341.79] - - [2048, 200, 1, 64] - - [615, 2307.71] + - [721, 2307.71] - - [2048, 200, 1, 4096] - - [655, 6212.04] + - [761, 6212.04] - - [1024, 1024, 1, 1536] - - [646, 8484.15] + - [752, 8484.15] - - [2048, 1024, 1, 7168] - - [554, 9315.24] + - [660, 9315.24] - - [1024, 256, 1, 3584] - - [644, 6207.32] + - [750, 6207.32] - - [4096, 256, 1, 32] - - [648, 2892.72] + - [754, 2892.72] - - [4096, 256, 1, 1280] - - [657, 8392.9] + - [763, 8392.9] - - [512, 512, 1, 3200] - - [655, 6219.41] + - [761, 6219.41] - - [2048, 1024, 1, 1536] - - [554, 9052.55] + - [660, 9052.55] - - [2048, 256, 1, 1024] - - [644, 7192.9] + - [750, 7192.9] - - [128, 200, 1, 512] - - [634, 502.677] + - [740, 502.677] - - [4096, 512, 1, 7168] - - [557, 9329.11] + - [663, 9329.11] - - [1024, 512, 1, 1152] - - [644, 7358.53] + - [750, 7358.53] - - [64, 1024, 1, 2048] - - [562, 2102.51] + - [668, 2102.51] - - [2048, 512, 1, 3328] - - [646, 8694.69] + - [752, 8694.69] - - [4096, 1024, 1, 896] - - [552, 9343.02] + - [658, 9343.02] - - [1, 1024, 1, 2048] - - [602, 40.9324] + - [708, 40.9324] - - [4096, 200, 1, 3584] - - [650, 6810.3] + - [756, 6810.3] - - [4096, 1024, 1, 4096] - - [552, 9347.56] + - [658, 9347.56] - - [1024, 256, 1, 14336] - - [545, 6625.8] + - [651, 6625.8] - - [2048, 200, 1, 256] - - [644, 4413.3] + - [750, 4413.3] - - [4096, 256, 1, 16384] - - [546, 8752.13] + - [652, 8752.13] - - [4096, 256, 1, 1920] - - [637, 8533.78] + - [743, 8533.78] - - [32, 1024, 1, 512] - - [635, 647.369] + - [741, 647.369] - - [1024, 256, 1, 7680] - - [655, 6387.36] + - [761, 6387.36] - - [2048, 256, 1, 1664] - - [655, 7631.44] + - [761, 7631.44] - - [512, 200, 1, 1536] - - [560, 2576.88] + - [666, 2576.88] - - [2048, 1024, 1, 6144] - - [541, 9033.77] + - [647, 9033.77] - - [512, 256, 1, 2816] - - [607, 3977.46] + - [713, 3977.46] - - [4096, 512, 1, 4160] - - [554, 9289.02] + - [660, 9289.02] - - [4096, 512, 1, 2080] - - [633, 9150.28] + - [739, 9150.28] - - [2048, 256, 1, 15360] - - [618, 7963.97] + - [724, 7963.97] - - [4096, 200, 1, 5120] - - [657, 6861.62] + - [763, 6861.62] - - [1024, 512, 1, 8192] - - [643, 7473.25] + - [749, 7473.25] - - [4096, 200, 1, 896] - - [660, 6443.25] + - [766, 6443.25] - - [2048, 512, 1, 8320] - - [650, 8810.24] + - [756, 8810.24] - - [1024, 1024, 1, 10240] - - [658, 8436.7] + - [764, 8436.7] - - [1024, 200, 1, 768] - - [644, 4087.58] + - [750, 4087.58] - - [2048, 200, 1, 640] - - [655, 5416.3] + - [761, 5416.3] - - [512, 200, 1, 2048] - - [609, 2702.62] + - [715, 2702.62] - - [1024, 1024, 1, 9216] - - [647, 8499.08] + - [753, 8499.08] - - [4096, 200, 1, 1408] - - [657, 6613.82] + - [763, 6613.82] - - [1024, 256, 1, 13312] - - [545, 6643.54] + - [651, 6643.54] - - [1024, 256, 1, 128] - - [576, 2706.1] + - [682, 2706.1] - - [2048, 200, 1, 5632] - - [655, 6270.12] + - [761, 6270.12] - - [64, 1024, 1, 512] - - [634, 1310.82] + - [740, 1310.82] - - [1024, 512, 1, 2560] - - [655, 7731.54] + - [761, 7731.54] - - [4096, 200, 1, 1280] - - [637, 6566.83] + - [743, 6566.83] - - [1024, 200, 1, 4096] - - [655, 4911.46] + - [761, 4911.46] - - [1024, 1024, 1, 2560] - - [646, 8630.35] + - [752, 8630.35] - - [2048, 512, 1, 64] - - [650, 4152.88] + - [756, 4152.88] - - [2048, 200, 1, 8192] - - [644, 6234.21] + - [750, 6234.21] - - [2048, 512, 1, 3072] - - [654, 8614.85] + - [760, 8614.85] - - [4096, 1024, 1, 5120] - - [552, 9573.75] + - [658, 9573.75] - - [4096, 256, 1, 640] - - [639, 7913.88] + - [745, 7913.88] - - [1024, 256, 1, 1280] - - [644, 5706.64] + - [750, 5706.64] - - [2048, 1024, 1, 1920] - - [554, 9141.34] + - [660, 9141.34] - - [2048, 256, 1, 4096] - - [644, 7937.28] + - [750, 7937.28] - - [2048, 1024, 1, 15360] - - [557, 9351.96] + - [663, 9351.96] - - [4096, 200, 1, 16384] - - [546, 6975.21] + - [652, 6975.21] - - [1, 1024, 1, 4096] - - [664, 60.7815] + - [770, 60.7815] - - [4096, 1024, 1, 2816] - - [552, 9583.98] + - [658, 9583.98] - - [4096, 200, 1, 1664] - - [639, 6658.7] + - [745, 6658.7] - - [4096, 512, 1, 256] - - [570, 7731.54] + - [676, 7731.54] - - [1024, 200, 1, 896] - - [644, 4193.45] + - [750, 4193.45] - - [2048, 200, 1, 6656] - - [655, 6291.17] + - [761, 6291.17] - - [2048, 1024, 1, 5120] - - [554, 9270.57] + - [660, 9270.57] - - [512, 1024, 1, 768] - - [644, 7099.06] + - [750, 7099.06] - - [2048, 512, 1, 14336] - - [624, 8559.13] + - [730, 8559.13] - - [2048, 200, 1, 8320] - - [644, 6314.72] + - [750, 6314.72] - - [4096, 256, 1, 3840] - - [660, 8718.56] + - [766, 8718.56] - - [2048, 1024, 1, 4096] - - [541, 8973.38] + - [647, 8973.38] - - [1024, 1024, 1, 3200] - - [650, 8701.98] + - [756, 8701.98] - - [1024, 256, 1, 4608] - - [644, 6268.05] + - [750, 6268.05] - - [4096, 512, 1, 4608] - - [552, 9316.47] + - [658, 9316.47] - - [2048, 512, 1, 2048] - - [643, 8462.76] + - [749, 8462.76] - - [4096, 512, 1, 1664] - - [552, 9074.53] + - [658, 9074.53] - - [4096, 256, 1, 4608] - - [639, 8718.05] + - [745, 8718.05] - - [1024, 512, 1, 32] - - [632, 1807.99] + - [738, 1807.99] - - [1024, 512, 1, 3840] - - [644, 7936.34] + - [750, 7936.34] - - [2048, 512, 1, 1920] - - [660, 8548.27] + - [766, 8548.27] - - [2048, 1024, 1, 896] - - [552, 8843.51] + - [658, 8843.51] - - [4096, 200, 1, 6144] - - [660, 6864.76] + - [766, 6864.76] - - [1024, 512, 1, 13312] - - [613, 7763.19] + - [719, 7763.19] - - [4096, 1024, 1, 4160] - - [552, 9650.72] + - [658, 9650.72] - - [2048, 200, 1, 2816] - - [644, 6119.76] + - [750, 6119.76] - - [1024, 1024, 1, 3840] - - [639, 8709.5] + - [745, 8709.5] - - [128, 1024, 1, 1024] - - [662, 2577.25] + - [768, 2577.25] - - [2048, 1024, 1, 11264] - - [557, 9339.06] + - [663, 9339.06] - - [2048, 1024, 1, 384] - - [646, 8210.81] + - [752, 8210.81] - - [1024, 256, 1, 2048] - - [667, 5755.58] + - [773, 5755.58] - - [2048, 1024, 1, 3840] - - [554, 9288.96] + - [660, 9288.96] - - [4096, 256, 1, 8320] - - [660, 8812.38] + - [766, 8812.38] - - [2048, 256, 1, 3840] - - [636, 7857.05] + - [742, 7857.05] - - [64, 256, 1, 512] - - [634, 336.182] + - [740, 336.182] - - [4096, 512, 1, 1280] - - [554, 8993.52] + - [660, 8993.52] - - [512, 256, 1, 1280] - - [586, 2996.03] + - [692, 2996.03] - - [1024, 512, 1, 7680] - - [644, 8041.59] + - [750, 8041.59] - - [4096, 1024, 1, 1152] - - [552, 9368.48] + - [658, 9368.48] - - [256, 200, 1, 512] - - [586, 993.07] + - [692, 993.07] - - [256, 1024, 1, 2048] - - [663, 4759.59] + - [769, 4759.59] - - [2048, 200, 1, 10240] - - [655, 6329.03] + - [761, 6329.03] - - [2048, 512, 1, 5120] - - [656, 8732.56] + - [762, 8732.56] - - [2048, 1024, 1, 1408] - - [554, 9006.9] + - [660, 9006.9] - - [512, 1024, 1, 512] - - [644, 6528.2] + - [750, 6528.2] - - [1024, 200, 1, 11264] - - [611, 5194.82] + - [717, 5194.82] - - [512, 1024, 1, 1024] - - [597, 6337.1] + - [703, 6337.1] - - [2048, 512, 1, 32] - - [563, 2777.78] + - [669, 2777.78] - - [4096, 256, 1, 2560] - - [646, 8621.49] + - [752, 8621.49] - - [4096, 256, 1, 64] - - [580, 4194.4] + - [686, 4194.4] - - [32, 1024, 1, 1024] - - [581, 778.264] + - [687, 778.264] - - [2048, 200, 1, 768] - - [655, 5507.33] + - [761, 5507.33] - - [512, 512, 1, 2048] - - [603, 5338.91] + - [709, 5338.91] - - [2048, 512, 1, 2560] - - [657, 8643.69] + - [763, 8643.69] - - [512, 256, 1, 512] - - [636, 2542.1] + - [742, 2542.1] - - [1024, 200, 1, 7680] - - [611, 5047.8] + - [717, 5047.8] - - [4096, 512, 1, 896] - - [552, 8856.85] + - [658, 8856.85] - - [4096, 1024, 1, 3072] - - [552, 9492.17] + - [658, 9492.17] - - [4096, 200, 1, 13312] - - [544, 6900.73] + - [650, 6900.73] - - [2048, 512, 1, 7168] - - [645, 8788.1] + - [751, 8788.1] - - [2048, 1024, 1, 2816] - - [557, 9229.88] + - [663, 9229.88] - - [2048, 512, 1, 128] - - [575, 5630.04] + - [681, 5630.04] - - [1024, 256, 1, 8192] - - [667, 6203.83] + - [773, 6203.83] - - [4096, 1024, 1, 1792] - - [552, 9510.42] + - [658, 9510.42] - - [1024, 200, 1, 6656] - - [636, 5002.85] + - [742, 5002.85] - - [1024, 1024, 1, 1024] - - [573, 8095.26] + - [679, 8095.26] - - [4096, 200, 1, 2304] - - [657, 6754.45] + - [763, 6754.45] - - [4096, 512, 1, 1152] - - [552, 8974.54] + - [658, 8974.54] - - [512, 200, 1, 1024] - - [634, 2233.01] + - [740, 2233.01] - - [1024, 256, 1, 3840] - - [655, 6244.72] + - [761, 6244.72] - - [512, 512, 1, 768] - - [644, 5331.84] + - [750, 5331.84] - - [2048, 512, 1, 4096] - - [654, 8621.76] + - [760, 8621.76] - - [2048, 256, 1, 2560] - - [644, 7770.93] + - [750, 7770.93] - - [2048, 256, 1, 4160] - - [655, 7923.08] + - [761, 7923.08] - - [1024, 256, 1, 64] - - [551, 1705.1] + - [657, 1705.1] - - [4096, 512, 1, 7680] - - [552, 9364.57] + - [658, 9364.57] - - [1024, 512, 1, 1664] - - [655, 7594.24] + - [761, 7594.24] - - [2048, 512, 1, 2080] - - [646, 8570.67] + - [752, 8570.67] - - [2048, 512, 1, 3840] - - [657, 8729.14] + - [763, 8729.14] - - [4096, 1024, 1, 384] - - [552, 8764.86] + - [658, 8764.86] - - [4096, 200, 1, 3072] - - [646, 6772.39] + - [752, 6772.39] - - [1024, 512, 1, 14336] - - [614, 7680.97] + - [720, 7680.97] - - [1024, 200, 1, 1920] - - [636, 4637.08] + - [742, 4637.08] - - [1024, 1024, 1, 1664] - - [650, 8506.49] + - [756, 8506.49] - - [512, 1024, 1, 2304] - - [644, 7775.33] + - [750, 7775.33] - - [2048, 1024, 1, 1792] - - [552, 9123.46] + - [658, 9123.46] - - [32, 200, 1, 512] - - [652, 125.744] + - [758, 125.744] - - [4096, 256, 1, 11264] - - [657, 8822.31] + - [763, 8822.31] - - [4096, 256, 1, 1408] - - [657, 8419.32] + - [763, 8419.32] - - [1024, 256, 1, 7168] - - [644, 6377.54] + - [750, 6377.54] - - [2048, 256, 1, 1152] - - [655, 7401.81] + - [761, 7401.81] - - [256, 256, 1, 512] - - [634, 1314.93] + - [740, 1314.93] - - [1024, 512, 1, 1280] - - [644, 7410.53] + - [750, 7410.53] - - [512, 512, 1, 1792] - - [636, 5931.44] + - [742, 5931.44] - - [2048, 200, 1, 12288] - - [618, 6242.25] + - [724, 6242.25] - - [2048, 200, 1, 1664] - - [655, 5953.75] + - [761, 5953.75] - - [4096, 200, 1, 4608] - - [650, 6853.54] + - [756, 6853.54] - - [512, 1024, 1, 2560] - - [644, 7778.13] + - [750, 7778.13] - - [4096, 200, 1, 384] - - [637, 5765.73] + - [743, 5765.73] - - [128, 512, 1, 512] - - [634, 1302.68] + - [740, 1302.68] - - [1024, 200, 1, 256] - - [638, 2861.93] + - [744, 2861.93] - - [256, 1024, 1, 1024] - - [579, 4522.26] + - [685, 4522.26] - - [2048, 200, 1, 128] - - [644, 3310.0] + - [750, 3310.0] - - [2048, 200, 1, 11264] - - [625, 6168.2] + - [731, 6168.2] - - [1024, 512, 1, 1920] - - [655, 7649.29] + - [761, 7649.29] - - [4096, 256, 1, 1536] - - [650, 8427.33] + - [756, 8427.33] - - [4096, 1024, 1, 3584] - - [552, 9618.0] + - [658, 9618.0] - - [2048, 256, 1, 256] - - [644, 5464.99] + - [750, 5464.99] - - [2048, 1024, 1, 768] - - [552, 8726.87] + - [658, 8726.87] - - [4096, 256, 1, 10240] - - [646, 8790.89] + - [752, 8790.89] - - [2048, 256, 1, 10240] - - [626, 7665.31] + - [732, 7665.31] - - [4096, 200, 1, 14336] - - [660, 6916.18] + - [766, 6916.18] - - [1024, 512, 1, 5120] - - [598, 7420.36] + - [704, 7420.36] - - [1024, 512, 1, 8320] - - [655, 8061.31] + - [761, 8061.31] - - [256, 200, 1, 2048] - - [610, 1916.36] + - [716, 1916.36] - - [1024, 200, 1, 640] - - [638, 3873.39] + - [744, 3873.39] - - [1024, 512, 1, 10240] - - [643, 7526.9] + - [749, 7526.9] - - [1024, 200, 1, 4160] - - [655, 4928.19] + - [761, 4928.19] - - [1024, 200, 1, 5632] - - [636, 4978.66] + - [742, 4978.66] - - [1024, 1024, 1, 2048] - - [591, 7937.28] + - [697, 7937.28] - - [1024, 256, 1, 6656] - - [655, 6373.68] + - [761, 6373.68] - - [2048, 1024, 1, 8320] - - [552, 9333.15] + - [658, 9333.15] - - [1024, 256, 1, 10240] - - [644, 6407.29] + - [750, 6407.29] - - [2048, 256, 1, 2080] - - [644, 7714.58] + - [750, 7714.58] - - [4096, 256, 1, 128] - - [558, 5765.47] + - [664, 5765.47] - - [1024, 256, 1, 768] - - [649, 5210.42] + - [755, 5210.42] - - [2048, 256, 1, 896] - - [655, 7267.46] + - [761, 7267.46] - - [64, 512, 1, 2048] - - [621, 1296.64] + - [727, 1296.64] - - [4096, 512, 1, 2048] - - [554, 9121.25] + - [660, 9121.25] - - [512, 256, 1, 2048] - - [607, 3283.31] + - [713, 3283.31] - - [4096, 256, 1, 16640] - - [639, 8839.88] + - [745, 8839.88] - - [4096, 512, 1, 2560] - - [557, 9222.15] + - [663, 9222.15] - - [1024, 512, 1, 15360] - - [608, 7865.66] + - [714, 7865.66] - - [4096, 1024, 1, 2304] - - [552, 9558.26] + - [658, 9558.26] - - [4096, 200, 1, 1152] - - [657, 6531.93] + - [763, 6531.93] - - [2048, 200, 1, 6144] - - [655, 6277.75] + - [761, 6277.75] - - [1024, 1024, 1, 7680] - - [650, 8799.34] + - [756, 8799.34] - - [2048, 200, 1, 1920] - - [655, 6031.02] + - [761, 6031.02] - - [32, 1024, 1, 2048] - - [629, 1174.98] + - [735, 1174.98] - - [1024, 200, 1, 3584] - - [636, 4880.44] + - [742, 4880.44] - - [4096, 256, 1, 2080] - - [643, 8557.22] + - [749, 8557.22] - - [1024, 1024, 1, 16384] - - [544, 8618.65] + - [650, 8618.65] - - [1024, 256, 1, 1408] - - [655, 5803.54] + - [761, 5803.54] - - [1024, 256, 1, 4096] - - [665, 6037.78] + - [771, 6037.78] - - [2048, 200, 1, 14336] - - [655, 6364.48] + - [761, 6364.48] - - [4096, 512, 1, 5120] - - [554, 9302.05] + - [660, 9302.05] - - [1024, 512, 1, 6144] - - [590, 7469.09] + - [696, 7469.09] - - [1024, 512, 1, 2304] - - [655, 7759.35] + - [761, 7759.35] - - [4096, 200, 1, 4160] - - [639, 6843.22] + - [745, 6843.22] - - [4096, 200, 1, 1536] - - [650, 6628.27] + - [756, 6628.27] - - [4096, 1024, 1, 6144] - - [552, 9593.08] - - - [1280, 384, 1, 64] - - [683, 3196.98] + - [658, 9593.08] - - [256, 64, 1, 1225] - - [684, 1194.77] + - [790, 1194.77] - - [2048, 320, 1, 64] - - [686, 3449.36] - - - [256, 48, 1, 1225] - - [677, 913.498] - - - [2048, 192, 1, 64] - - [676, 2516.68] + - [792, 3449.36] - - [1024, 128, 1, 289] - - [690, 2869.78] - - - [1280, 192, 1, 64] - - [669, 1872.56] - - - [192, 32, 1, 1225] - - [674, 505.906] - - - [1280, 448, 1, 64] - - [670, 3078.97] + - [796, 2869.78] - - [384, 64, 1, 1225] - - [675, 1511.43] + - [781, 1511.43] - - [2048, 384, 1, 64] - - [688, 3836.35] - - - [288, 48, 1, 1225] - - [671, 1032.69] + - [794, 3836.35] - - [64, 80, 1, 5329] - - [687, 888.267] + - [793, 888.267] - - [1024, 384, 1, 289] - - [681, 4291.62] + - [787, 4291.62] - - [2048, 448, 1, 64] - - [680, 3783.62] - - - [1280, 320, 1, 64] - - [686, 2777.05] - - - [192, 64, 1, 1225] - - [671, 926.997] - - - [384, 192, 1, 1225] - - [682, 2560.1] - - - [1536, 256, 1, 64] - - [689, 2621.54] - - - [192, 48, 1, 1225] - - [674, 698.714] - - - [768, 128, 1, 289] - - [691, 2291.22] - - - [1024, 256, 1, 289] - - [689, 4064.46] + - [786, 3783.62] - - [768, 192, 1, 289] - - [685, 2690.43] - - - [1536, 384, 1, 64] - - [672, 3145.83] + - [791, 2690.43] - - [288, 64, 1, 1225] - - [674, 1142.77] - - - [1024, 192, 1, 289] - - [679, 3243.23] + - [780, 1142.77] - - [384, 96, 1, 1225] - - [692, 1844.81] - - - [160, 64, 1, 5329] - - [678, 1564.58] - - - [768, 160, 1, 289] - - [673, 2386.68] + - [798, 1844.81] - - [1024, 3392, 1, 4096] - - [718, 8503.02] + - [824, 8503.02] - - [1024, 3301, 1, 4096] - - [720, 8414.1] + - [826, 8414.1] - - [1024, 3443, 1, 4096] - - [707, 8536.59] + - [813, 8536.59] - - [132, 134, 480, 64] - - [745, 4149.27] + - [851, 4149.27] - - [162, 162, 400, 64] - - [733, 5539.73] + - [839, 5539.73] - - [4096, 3548, 1, 1024] - - [699, 9773.01] + - [805, 9773.01] - - [4096, 2977, 1, 1024] - - [700, 9574.43] + - [806, 9574.43] - - [132, 135, 480, 64] - - [745, 4167.51] + - [851, 4167.51] - - [1024, 2985, 1, 4096] - - [703, 9133.99] + - [809, 9133.99] - - [33708, 3681, 1, 1024] - - [700, 10033.8] + - [806, 10033.8] - - [4096, 3443, 1, 1024] - - [700, 9513.78] + - [806, 9513.78] - - [11, 11, 5456, 64] - - [742, 627.346] + - [848, 627.346] - - [1024, 3400, 1, 4096] - - [721, 8420.02] + - [827, 8420.02] - - [4096, 3995, 1, 1024] - - [699, 9693.87] + - [805, 9693.87] - - [4096, 3190, 1, 1024] - - [699, 9474.84] + - [805, 9474.84] - - [4096, 3594, 1, 1024] - - [700, 9315.83] + - [806, 9315.83] - - [159, 162, 400, 64] - - [732, 5429.98] + - [838, 5429.98] - - [1024, 3565, 1, 4096] - - [715, 8532.8] + - [821, 8532.8] - - [4096, 3422, 1, 1024] - - [700, 9459.24] + - [806, 9459.24] - - [1024, 3214, 1, 4096] - - [720, 8064.92] + - [826, 8064.92] - - [33708, 3584, 1, 1024] - - [701, 10129.0] + - [807, 10129.0] - - [33708, 3640, 1, 1024] - - [698, 9919.22] + - [804, 9919.22] - - [4096, 3263, 1, 1024] - - [698, 9699.35] + - [804, 9699.35] - - [4096, 3296, 1, 1024] - - [698, 9780.8] + - [804, 9780.8] - - [1024, 3557, 1, 4096] - - [719, 8526.89] + - [825, 8526.89] - - [4096, 3463, 1, 1024] - - [698, 9578.13] + - [804, 9578.13] - - [4096, 3528, 1, 1024] - - [698, 9739.92] + - [804, 9739.92] - - [14, 14, 4368, 64] - - [730, 991.276] + - [836, 991.276] - - [4096, 3226, 1, 1024] - - [698, 9587.19] + - [804, 9587.19] - - [4096, 3439, 1, 1024] - - [701, 9499.72] + - [807, 9499.72] - - [1024, 3523, 1, 4096] - - [721, 8393.58] + - [827, 8393.58] - - [1024, 3098, 1, 4096] - - [727, 7882.87] + - [833, 7882.87] - - [4096, 3121, 1, 1024] - - [698, 9296.23] + - [804, 9296.23] - - [33708, 3894, 1, 1024] - - [699, 9952.27] + - [805, 9952.27] - - [1024, 3548, 1, 4096] - - [705, 8432.45] + - [811, 8432.45] - - [1024, 3451, 1, 4096] - - [718, 8456.44] + - [824, 8456.44] - - [4096, 3353, 1, 1024] - - [700, 9289.08] + - [806, 9289.08] - - [4096, 3402, 1, 1024] - - [700, 9406.44] + - [806, 9406.44] - - [4096, 3939, 1, 1024] - - [698, 9549.59] + - [804, 9549.59] - - [133, 133, 480, 64] - - [745, 4124.31] + - [851, 4124.31] - - [1024, 3559, 1, 4096] - - [720, 8587.04] + - [826, 8587.04] - - [1024, 2977, 1, 4096] - - [703, 9084.59] + - [809, 9084.59] - - [1024, 3478, 1, 4096] - - [714, 8342.85] + - [820, 8342.85] - - [134, 134, 480, 64] - - [747, 4204.43] + - [853, 4204.43] - - [1024, 3368, 1, 4096] - - [720, 8277.43] + - [826, 8277.43] - - [4096, 4012, 1, 1024] - - [700, 9726.57] + - [806, 9726.57] - - [4096, 3486, 1, 1024] - - [698, 9639.71] + - [804, 9639.71] - - [1024, 3479, 1, 4096] - - [708, 8420.37] + - [814, 8420.37] - - [1024, 3505, 1, 4096] - - [720, 8310.66] + - [826, 8310.66] - - [4096, 3381, 1, 1024] - - [701, 9357.75] + - [807, 9357.75] - - [4096, 3430, 1, 1024] - - [698, 9482.36] + - [804, 9482.36] - - [1024, 3554, 1, 4096] - - [720, 8592.38] + - [826, 8592.38] - - [4096, 3271, 1, 1024] - - [698, 9715.41] + - [804, 9715.41] - - [1024, 3063, 1, 4096] - - [702, 9388.56] + - [808, 9388.56] - - [1024, 3209, 1, 4096] - - [720, 8212.74] + - [826, 8212.74] - - [4096, 3503, 1, 1024] - - [700, 9680.59] + - [806, 9680.59] - - [4096, 3344, 1, 1024] - - [698, 9268.55] + - [804, 9268.55] - - [1024, 3147, 1, 4096] - - [721, 8037.2] + - [827, 8037.2] - - [1024, 3322, 1, 4096] - - [719, 8356.32] + - [825, 8356.32] - - [1024, 3341, 1, 4096] - - [720, 8316.33] + - [826, 8316.33] - - [1024, 3516, 1, 4096] - - [702, 8397.12] + - [808, 8397.12] - - [102, 101, 624, 64] - - [733, 4709.59] + - [839, 4709.59] - - [1024, 3454, 1, 4096] - - [719, 8425.6] + - [825, 8425.6] - - [4096, 3969, 1, 1024] - - [700, 9640.15] + - [806, 9640.15] - - [4096, 3466, 1, 1024] - - [700, 9576.83] + - [806, 9576.83] - - [1024, 3999, 1, 1024] - - [703, 9207.15] + - [809, 9207.15] - - [1024, 4032, 1, 1024] - - [704, 9294.56] + - [810, 9294.56] - - [1024, 3403, 1, 4096] - - [718, 8357.97] + - [824, 8357.97] - - [4096, 3361, 1, 1024] - - [700, 9308.78] + - [806, 9308.78] - - [1024, 3527, 1, 4096] - - [719, 8512.19] + - [825, 8512.19] - - [1024, 3822, 1, 4096] - - [703, 8991.13] + - [809, 8991.13] - - [4096, 3315, 1, 1024] - - [698, 9834.96] + - [804, 9834.96] - - [232, 232, 272, 64] - - [732, 6481.62] + - [838, 6481.62] - - [1024, 3336, 1, 4096] - - [721, 8295.61] + - [827, 8295.61] - - [228, 232, 272, 64] - - [733, 6327.85] + - [839, 6327.85] - - [4096, 3547, 1, 1024] - - [698, 9781.56] + - [804, 9781.56] - - [4096, 3340, 1, 1024] - - [700, 9269.72] + - [806, 9269.72] - - [1024, 3906, 1, 1024] - - [704, 9018.38] + - [810, 9018.38] - - [1024, 3295, 1, 4096] - - [718, 8194.83] + - [824, 8194.83] - - [4096, 3294, 1, 1024] - - [701, 9762.16] + - [807, 9762.16] - - [33708, 3968, 1, 1024] - - [701, 10147.8] + - [807, 10147.8] - - [1024, 3473, 1, 4096] - - [707, 8318.68] + - [813, 8318.68] - - [1024, 3072, 1, 4096] - - [704, 9370.13] + - [810, 9370.13] - - [4096, 3189, 1, 1024] - - [698, 9470.26] + - [804, 9470.26] - - [4096, 3494, 1, 1024] - - [698, 9661.32] + - [804, 9661.32] - - [1024, 3522, 1, 4096] - - [721, 8459.23] + - [827, 8459.23] - - [33708, 3944, 1, 1024] - - [701, 10060.2] + - [807, 10060.2] - - [135, 135, 480, 64] - - [746, 4257.03] + - [852, 4257.03] - - [4096, 3421, 1, 1024] - - [698, 9456.98] + - [804, 9456.98] - - [32, 32, 1984, 64] - - [743, 3436.24] + - [849, 3436.24] - - [4096, 3311, 1, 1024] - - [698, 9810.88] + - [804, 9810.88] - - [1024, 3990, 1, 1024] - - [705, 9197.74] + - [811, 9197.74] - - [1024, 3290, 1, 4096] - - [718, 8229.63] + - [824, 8229.63] - - [4096, 3565, 1, 1024] - - [699, 9824.48] + - [805, 9824.48] - - [1024, 3484, 1, 4096] - - [708, 8575.38] + - [814, 8575.38] - - [4096, 3384, 1, 1024] - - [698, 9366.54] + - [804, 9366.54] - - [1024, 3422, 1, 4096] - - [718, 8484.12] + - [824, 8484.12] - - [4096, 3681, 1, 1024] - - [699, 9520.16] + - [805, 9520.16] - - [1024, 3584, 1, 1024] - - [725, 8583.37] + - [831, 8583.37] - - [4096, 4050, 1, 1024] - - [700, 9807.35] + - [806, 9807.35] - - [1024, 3996, 1, 4096] - - [701, 9181.7] + - [807, 9181.7] - - [4096, 3169, 1, 1024] - - [699, 9411.4] + - [805, 9411.4] - - [4096, 3538, 1, 1024] - - [699, 9765.99] + - [805, 9765.99] - - [1024, 3495, 1, 4096] - - [705, 8295.95] + - [811, 8295.95] - - [4096, 3401, 1, 1024] - - [698, 9402.68] + - [804, 9402.68] - - [1024, 3560, 1, 4096] - - [719, 8513.45] + - [825, 8513.45] - - [133, 135, 480, 64] - - [746, 4199.08] + - [852, 4199.08] - - [1024, 3263, 1, 4096] - - [720, 8172.23] + - [826, 8172.23] - - [1024, 3870, 1, 4096] - - [700, 8996.27] + - [806, 8996.27] - - [4096, 3555, 1, 1024] - - [701, 9811.88] + - [807, 9811.88] - - [4096, 3412, 1, 1024] - - [698, 9432.09] + - [804, 9432.09] - - [101, 101, 624, 64] - - [732, 4667.69] + - [838, 4667.69] - - [1024, 3296, 1, 4096] - - [719, 8350.61] + - [825, 8350.61] - - [1024, 3379, 1, 4096] - - [721, 8432.94] + - [827, 8432.94] - - [4096, 3302, 1, 1024] - - [698, 9796.39] + - [804, 9796.39] - - [1024, 3490, 1, 4096] - - [718, 8538.44] + - [824, 8538.44] - - [1024, 3428, 1, 4096] - - [719, 8531.67] + - [825, 8531.67] - - [1024, 3976, 1, 4096] - - [700, 9327.87] + - [806, 9327.87] - - [4096, 3485, 1, 1024] - - [698, 9628.82] + - [804, 9628.82] - - [4096, 3534, 1, 1024] - - [698, 9755.97] + - [804, 9755.97] - - [1024, 3064, 1, 4096] - - [704, 9196.98] + - [810, 9196.98] - - [4096, 3216, 1, 1024] - - [700, 9563.44] + - [806, 9563.44] - - [1024, 3450, 1, 4096] - - [728, 8519.29] + - [834, 8519.29] - - [1024, 3533, 1, 4096] - - [719, 8495.77] + - [825, 8495.77] - - [1024, 4030, 1, 1024] - - [704, 9304.68] + - [810, 9304.68] - - [1024, 3311, 1, 4096] - - [719, 8278.6] + - [825, 8278.6] - - [1024, 3468, 1, 4096] - - [710, 8564.55] + - [816, 8564.55] - - [23, 23, 2720, 64] - - [734, 2311.55] + - [840, 2311.55] - - [4096, 3359, 1, 1024] - - [700, 9309.15] + - [806, 9309.15] - - [4096, 3392, 1, 1024] - - [700, 9388.19] + - [806, 9388.19] - - [1024, 3925, 1, 1024] - - [702, 9006.72] + - [808, 9006.72] - - [4096, 3233, 1, 1024] - - [698, 9603.64] + - [804, 9603.64] - - [4096, 3956, 1, 1024] - - [699, 9581.94] + - [805, 9581.94] - - [1024, 3463, 1, 4096] - - [720, 8293.97] + - [826, 8293.97] - - [1024, 3126, 1, 4096] - - [719, 7978.13] + - [825, 7978.13] - - [1024, 3363, 1, 4096] - - [712, 8267.47] + - [818, 8267.47] - - [4096, 3465, 1, 1024] - - [698, 9590.74] + - [804, 9590.74] - - [33708, 3996, 1, 1024] - - [699, 9899.99] + - [805, 9899.99] - - [1024, 3231, 1, 4096] - - [720, 8231.68] + - [826, 8231.68] - - [33708, 3978, 1, 1024] - - [699, 9853.64] + - [805, 9853.64] - - [4096, 3476, 1, 1024] - - [698, 9616.62] + - [804, 9616.62] - - [85, 85, 752, 64] - - [730, 4240.65] + - [836, 4240.65] - - [4096, 3339, 1, 1024] - - [700, 9249.81] + - [806, 9249.81] - - [4096, 3452, 1, 1024] - - [698, 9534.13] + - [804, 9534.13] - - [1024, 3396, 1, 4096] - - [719, 8451.23] + - [825, 8451.23] - - [4096, 3293, 1, 1024] - - [700, 9775.22] + - [806, 9775.22] - - [54, 54, 1184, 64] - - [732, 4153.54] + - [838, 4153.54] - - [1024, 3432, 1, 4096] - - [713, 8345.53] + - [819, 8345.53] - - [4096, 3493, 1, 1024] - - [701, 9649.9] + - [807, 9649.9] - - [4096, 3350, 1, 1024] - - [700, 9273.91] + - [806, 9273.91] - - [1024, 3079, 1, 4096] - - [728, 7775.66] + - [834, 7775.66] - - [1024, 3101, 1, 4096] - - [728, 7847.85] + - [834, 7847.85] - - [33708, 3939, 1, 1024] - - [701, 10054.4] + - [807, 10054.4] - - [4096, 3256, 1, 1024] - - [700, 9681.83] + - [806, 9681.83] - - [1024, 3439, 1, 4096] - - [719, 8531.11] + - [825, 8531.11] - - [1024, 3510, 1, 4096] - - [718, 8422.31] + - [824, 8422.31] - - [4096, 3900, 1, 1024] - - [699, 9468.61] + - [805, 9468.61] - - [1024, 3470, 1, 4096] - - [720, 8507.77] + - [826, 8507.77] - - [4096, 3456, 1, 1024] - - [700, 9577.46] + - [806, 9577.46] - - [4096, 3014, 1, 1024] - - [699, 9666.15] + - [805, 9666.15] - - [4096, 3367, 1, 1024] - - [701, 9328.36] + - [807, 9328.36] - - [4096, 3432, 1, 1024] - - [698, 9480.88] + - [804, 9480.88] - - [33708, 4026, 1, 1024] - - [701, 9972.83] + - [807, 9972.83] - - [4096, 3273, 1, 1024] - - [698, 9716.95] + - [804, 9716.95] - - [4096, 3130, 1, 1024] - - [698, 9311.4] + - [804, 9311.4] - - [1024, 3496, 1, 4096] - - [709, 8434.65] + - [815, 8434.65] - - [1024, 3995, 1, 4096] - - [694, 9157.73] + - [800, 9157.73] - - [1024, 3939, 1, 4096] - - [702, 9059.86] + - [808, 9059.86] - - [1024, 3121, 1, 4096] - - [726, 7963.43] + - [832, 7963.43] - - [1024, 3232, 1, 4096] - - [720, 8061.09] + - [826, 8061.09] - - [4096, 3147, 1, 1024] - - [700, 9364.63] + - [806, 9364.63] - - [4096, 3516, 1, 1024] - - [698, 9708.84] + - [804, 9708.84] - - [1024, 3969, 1, 1024] - - [704, 9168.68] + - [810, 9168.68] - - [1024, 3364, 1, 4096] - - [708, 8363.65] + - [814, 8363.65] - - [4096, 3411, 1, 1024] - - [701, 9442.77] + - [807, 9442.77] - - [147, 147, 432, 64] - - [745, 4843.21] + - [851, 4843.21] - - [4096, 3301, 1, 1024] - - [700, 9783.46] + - [806, 9783.46] - - [112, 111, 576, 64] - - [732, 5627.47] + - [838, 5627.47] - - [1024, 3513, 1, 4096] - - [719, 8725.41] + - [825, 8725.41] - - [1024, 3469, 1, 4096] - - [699, 8183.11] + - [805, 8183.11] - - [1024, 3095, 1, 4096] - - [720, 7887.87] + - [826, 7887.87] - - [4096, 3533, 1, 1024] - - [699, 9755.27] + - [805, 9755.27] - - [4096, 3390, 1, 1024] - - [698, 9377.21] + - [804, 9377.21] - - [4096, 3582, 1, 1024] - - [698, 9874.96] + - [804, 9874.96] - - [1024, 3956, 1, 1024] - - [704, 9058.82] + - [810, 9058.82] - - [4096, 3585, 1, 1024] - - [700, 9289.75] + - [806, 9289.75] - - [4096, 3231, 1, 1024] - - [699, 9597.15] + - [805, 9597.15] - - [1024, 3205, 1, 4096] - - [718, 8073.25] + - [824, 8073.25] - - [4096, 3496, 1, 1024] - - [699, 9668.38] + - [805, 9668.38] - - [1024, 3143, 1, 4096] - - [718, 8031.68] + - [824, 8031.68] - - [1024, 3318, 1, 4096] - - [715, 8261.43] + - [821, 8261.43] - - [1024, 3353, 1, 4096] - - [719, 8414.92] + - [825, 8414.92] - - [1024, 3464, 1, 4096] - - [718, 8310.03] + - [824, 8310.03] - - [4096, 2736, 1, 1024] - - [700, 9563.12] + - [806, 9563.12] - - [1024, 3402, 1, 4096] - - [715, 8413.84] + - [821, 8413.84] - - [4096, 3138, 1, 1024] - - [700, 9342.09] + - [806, 9342.09] - - [1024, 3860, 1, 4096] - - [703, 9008.57] + - [809, 9008.57] - - [148, 148, 432, 64] - - [745, 4915.7] + - [851, 4915.7] - - [1024, 3539, 1, 4096] - - [715, 8449.36] + - [821, 8449.36] - - [4096, 3211, 1, 1024] - - [700, 9551.28] + - [806, 9551.28] - - [1024, 3332, 1, 4096] - - [708, 8295.11] + - [814, 8295.11] - - [1024, 3466, 1, 4096] - - [719, 8339.25] + - [825, 8339.25] - - [4096, 3475, 1, 1024] - - [698, 9612.33] + - [804, 9612.33] - - [4096, 3524, 1, 1024] - - [701, 9722.74] + - [807, 9722.74] - - [4096, 2985, 1, 1024] - - [701, 9591.33] + - [807, 9591.33] - - [4096, 3222, 1, 1024] - - [698, 9577.48] + - [804, 9577.48] - - [4096, 3451, 1, 1024] - - [700, 9541.42] + - [806, 9541.42] - - [1024, 3181, 1, 4096] - - [718, 8118.89] + - [824, 8118.89] - - [1024, 3640, 1, 4096] - - [703, 8617.11] + - [809, 8617.11] - - [1024, 3375, 1, 4096] - - [707, 8419.75] + - [813, 8419.75] - - [1024, 3550, 1, 4096] - - [720, 8512.83] + - [826, 8512.83] - - [1024, 4020, 1, 1024] - - [704, 9266.9] + - [810, 9266.9] - - [1024, 3840, 1, 4096] - - [703, 8983.49] + - [809, 8983.49] - - [4096, 3349, 1, 1024] - - [698, 9279.96] + - [804, 9279.96] - - [4096, 3398, 1, 1024] - - [699, 9402.32] + - [805, 9402.32] - - [33708, 3976, 1, 1024] - - [700, 9849.54] + - [806, 9849.54] - - [1024, 2917, 1, 4096] - - [705, 8936.87] + - [811, 8936.87] - - [33708, 3910, 1, 1024] - - [698, 9983.35] + - [804, 9983.35] - - [4096, 3860, 1, 1024] - - [699, 9377.58] + - [805, 9377.58] - - [4096, 3304, 1, 1024] - - [701, 9798.44] + - [807, 9798.44] - - [1024, 3286, 1, 4096] - - [706, 8167.41] + - [812, 8167.41] - - [1024, 3460, 1, 4096] - - [716, 8539.56] + - [822, 8539.56] - - [1024, 4026, 1, 4096] - - [702, 9305.68] + - [808, 9305.68] - - [4096, 3471, 1, 1024] - - [700, 9596.71] + - [806, 9596.71] - - [193, 193, 320, 64] - - [748, 4758.46] + - [854, 4758.46] - - [1024, 3894, 1, 1024] - - [702, 8979.6] + - [808, 8979.6] - - [65, 65, 992, 64] - - [744, 2565.49] + - [850, 2565.49] - - [1024, 3506, 1, 4096] - - [716, 8593.22] + - [822, 8593.22] - - [35, 35, 1808, 64] - - [738, 2129.72] + - [844, 2129.72] - - [1024, 4000, 1, 1024] - - [702, 9204.6] + - [808, 9204.6] - - [1024, 3900, 1, 4096] - - [698, 9050.36] + - [804, 9050.36] - - [1024, 3445, 1, 4096] - - [721, 8551.65] + - [827, 8551.65] - - [4096, 3442, 1, 1024] - - [699, 9505.0] + - [805, 9505.0] - - [1024, 3358, 1, 4096] - - [720, 8437.16] + - [826, 8437.16] - - [13, 13, 4672, 64] - - [731, 860.665] + - [837, 860.665] - - [1024, 3211, 1, 4096] - - [724, 8085.25] + - [830, 8085.25] - - [4096, 3515, 1, 1024] - - [700, 9715.29] + - [806, 9715.29] - - [1024, 3564, 1, 4096] - - [706, 8760.37] + - [812, 8760.37] - - [4096, 3057, 1, 1024] - - [700, 9804.05] + - [806, 9804.05] - - [1024, 3343, 1, 4096] - - [718, 8363.8] + - [824, 8363.8] - - [4096, 3262, 1, 1024] - - [699, 9686.49] + - [805, 9686.49] - - [1024, 3518, 1, 4096] - - [718, 8455.05] + - [824, 8455.05] - - [77, 77, 816, 64] - - [737, 3505.94] + - [843, 3505.94] - - [33708, 3876, 1, 1024] - - [699, 9895.95] + - [805, 9895.95] - - [4096, 3462, 1, 1024] - - [700, 9570.31] + - [806, 9570.31] - - [1024, 3265, 1, 4096] - - [718, 8322.75] + - [824, 8322.75] - - [4096, 3389, 1, 1024] - - [699, 9382.86] + - [805, 9382.86] - - [4096, 3438, 1, 1024] - - [700, 9503.47] + - [806, 9503.47] - - [1024, 3955, 1, 1024] - - [702, 9064.45] + - [808, 9064.45] - - [1024, 3545, 1, 4096] - - [721, 8652.41] + - [827, 8652.41] - - [1024, 3144, 1, 4096] - - [721, 8060.55] + - [827, 8060.55] - - [1024, 3417, 1, 4096] - - [719, 8505.91] + - [825, 8505.91] - - [4096, 3543, 1, 1024] - - [698, 9775.67] + - [804, 9775.67] - - [4096, 3352, 1, 1024] - - [700, 9282.87] + - [806, 9282.87] - - [33708, 3975, 1, 1024] - - [701, 9849.49] + - [807, 9849.49] - - [148, 147, 432, 64] - - [745, 4876.15] + - [851, 4876.15] - - [4096, 3137, 1, 1024] - - [698, 9330.63] + - [804, 9330.63] - - [4096, 3506, 1, 1024] - - [701, 9682.76] + - [807, 9682.76] - - [1024, 3975, 1, 1024] - - [704, 9164.77] + - [810, 9164.77] - - [1024, 3859, 1, 4096] - - [702, 8983.84] + - [808, 8983.84] - - [4096, 3369, 1, 1024] - - [700, 9330.45] + - [806, 9330.45] - - [1024, 3434, 1, 4096] - - [718, 8486.98] + - [824, 8486.98] - - [1024, 3292, 1, 4096] - - [718, 8478.96] + - [824, 8478.96] - - [4096, 3523, 1, 1024] - - [698, 9734.83] + - [804, 9734.83] - - [4096, 3380, 1, 1024] - - [700, 9354.49] + - [806, 9354.49] - - [1024, 3408, 1, 4096] - - [721, 8441.03] + - [827, 8441.03] - - [4096, 3221, 1, 1024] - - [700, 9575.59] + - [806, 9575.59] - - [4096, 3270, 1, 1024] - - [700, 9717.95] + - [806, 9717.95] - - [143, 143, 432, 64] - - [746, 4643.45] + - [852, 4643.45] - - [111, 111, 576, 64] - - [738, 5475.04] + - [844, 5475.04] - - [1024, 3303, 1, 4096] - - [720, 8413.07] + - [826, 8413.07] - - [4096, 3502, 1, 1024] - - [700, 9679.87] + - [806, 9679.87] - - [1024, 3222, 1, 4096] - - [720, 8141.88] + - [826, 8141.88] - - [4096, 2505, 1, 1024] - - [698, 9594.95] + - [804, 9594.95] - - [4096, 3397, 1, 1024] - - [698, 9392.61] + - [804, 9392.61] - - [4096, 3562, 1, 1024] - - [698, 9827.58] + - [804, 9827.58] - - [4096, 3095, 1, 1024] - - [700, 9222.45] + - [806, 9222.45] - - [1024, 3226, 1, 4096] - - [716, 8027.03] + - [822, 8027.03] - - [177, 177, 352, 64] - - [733, 6406.96] + - [839, 6406.96] - - [4096, 3360, 1, 1024] - - [699, 9298.15] + - [805, 9298.15] - - [1024, 3942, 1, 1024] - - [704, 9061.59] + - [810, 9061.59] - - [1024, 3298, 1, 4096] - - [721, 8254.36] + - [827, 8254.36] - - [1024, 3381, 1, 4096] - - [720, 8508.81] + - [826, 8508.81] - - [4096, 3314, 1, 1024] - - [700, 9837.56] + - [806, 9837.56] - - [1024, 3492, 1, 4096] - - [708, 8583.39] + - [814, 8583.39] - - [1024, 3430, 1, 4096] - - [708, 8492.71] + - [814, 8492.71] - - [4096, 3977, 1, 1024] - - [700, 9656.45] + - [806, 9656.45] - - [4096, 3546, 1, 1024] - - [698, 9780.35] + - [804, 9780.35] - - [4096, 3640, 1, 1024] - - [698, 9415.51] + - [804, 9415.51] - - [4096, 3441, 1, 1024] - - [699, 9499.24] + - [805, 9499.24] - - [33708, 4059, 1, 1024] - - [701, 10051.9] + - [807, 10051.9] - - [1024, 3978, 1, 1024] - - [702, 9158.8] + - [808, 9158.8] - - [1024, 3376, 1, 4096] - - [720, 8415.44] + - [826, 8415.44] - - [1024, 3482, 1, 4096] - - [721, 8396.62] + - [827, 8396.62] - - [1024, 3563, 1, 4096] - - [704, 8424.18] + - [810, 8424.18] - - [4096, 4020, 1, 1024] - - [701, 9745.96] + - [807, 9745.96] - - [1024, 3271, 1, 4096] - - [719, 8289.68] + - [825, 8289.68] - - [1024, 3291, 1, 4096] - - [719, 8222.71] + - [825, 8222.71] - - [1024, 3431, 1, 4096] - - [714, 8464.4] + - [820, 8464.4] - - [1024, 3481, 1, 4096] - - [720, 8386.5] + - [826, 8386.5] - - [84, 85, 752, 64] - - [735, 4194.85] + - [841, 4194.85] - - [4096, 3461, 1, 1024] - - [698, 9579.67] + - [804, 9579.67] - - [1024, 3574, 1, 4096] - - [721, 8579.8] + - [827, 8579.8] - - [1024, 4059, 1, 1024] - - [702, 9330.54] + - [808, 9330.54] - - [84, 84, 752, 64] - - [742, 4141.46] + - [848, 4141.46] - - [1024, 3421, 1, 4096] - - [721, 8528.42] + - [827, 8528.42] - - [4096, 3224, 1, 1024] - - [700, 9589.95] + - [806, 9589.95] - - [4096, 3437, 1, 1024] - - [700, 9498.2] + - [806, 9498.2] - - [45, 45, 1424, 64] - - [732, 3314.58] + - [838, 3314.58] - - [4096, 3840, 1, 1024] - - [698, 9931.37] + - [804, 9931.37] - - [4096, 3168, 1, 1024] - - [700, 9412.16] + - [806, 9412.16] - - [33708, 3990, 1, 1024] - - [698, 9884.39] + - [804, 9884.39] - - [1024, 3349, 1, 4096] - - [720, 8421.4] + - [826, 8421.4] - - [4096, 3335, 1, 1024] - - [698, 9241.65] + - [804, 9241.65] - - [4096, 3400, 1, 1024] - - [700, 9407.35] + - [806, 9407.35] - - [160, 159, 400, 64] - - [747, 5708.94] + - [853, 5708.94] - - [1024, 3398, 1, 4096] - - [720, 8624.03] + - [826, 8624.03] - - [1024, 3780, 1, 4096] - - [700, 8756.78] + - [806, 8756.78] - - [29, 29, 2176, 64] - - [743, 2963.69] + - [849, 2963.69] - - [4096, 3098, 1, 1024] - - [698, 9229.82] + - [804, 9229.82] - - [1024, 4012, 1, 4096] - - [704, 9422.03] + - [810, 9422.03] - - [4096, 3505, 1, 1024] - - [700, 9687.65] + - [806, 9687.65] - - [4096, 3554, 1, 1024] - - [700, 9812.22] + - [806, 9812.22] - - [4096, 3063, 1, 1024] - - [700, 9825.1] + - [806, 9825.1] - - [1024, 3503, 1, 4096] - - [718, 8404.74] + - [824, 8404.74] - - [1024, 3166, 1, 4096] - - [721, 8084.93] + - [827, 8084.93] - - [1024, 3425, 1, 4096] - - [721, 8537.58] + - [827, 8537.58] - - [1024, 3344, 1, 4096] - - [712, 8351.16] + - [818, 8351.16] - - [4096, 3484, 1, 1024] - - [700, 9635.7] + - [806, 9635.7] - - [1024, 3681, 1, 1024] - - [703, 8457.18] + - [809, 8457.18] - - [1024, 4050, 1, 1024] - - [704, 9326.21] + - [810, 9326.21] - - [4096, 3379, 1, 1024] - - [698, 9356.16] + - [804, 9356.16] - - [4096, 3428, 1, 1024] - - [699, 9472.33] + - [805, 9472.33] - - [12, 12, 5040, 64] - - [737, 741.617] + - [843, 741.617] - - [27, 27, 2336, 64] - - [743, 2757.9] + - [849, 2757.9] - - [1024, 3304, 1, 4096] - - [721, 8317.82] + - [827, 8317.82] - - [1024, 3387, 1, 4096] - - [719, 8460.15] + - [825, 8460.15] - - [4096, 3126, 1, 1024] - - [701, 9308.48] + - [807, 9308.48] - - [1024, 3498, 1, 4096] - - [718, 8485.55] + - [824, 8485.55] - - [1024, 3436, 1, 4096] - - [720, 8397.71] + - [826, 8397.71] - - [4096, 3501, 1, 1024] - - [698, 9681.19] + - [804, 9681.19] - - [4096, 3358, 1, 1024] - - [700, 9304.9] + - [806, 9304.9] - - [4096, 3232, 1, 1024] - - [698, 9607.2] + - [804, 9607.2] - - [1024, 3585, 1, 4096] - - [702, 8510.74] + - [808, 8510.74] - - [4096, 3143, 1, 1024] - - [701, 9355.91] + - [807, 9355.91] - - [4096, 3464, 1, 1024] - - [700, 9585.95] + - [806, 9585.95] - - [1024, 3366, 1, 4096] - - [708, 8275.23] + - [814, 8275.23] - - [4096, 3375, 1, 1024] - - [698, 9342.13] + - [804, 9342.13] - - [4096, 2917, 1, 1024] - - [698, 9372.84] + - [804, 9372.84] - - [4096, 4026, 1, 1024] - - [700, 9759.15] + - [806, 9759.15] - - [49, 49, 1296, 64] - - [739, 3710.02] + - [845, 3710.02] - - [1024, 3277, 1, 4096] - - [719, 8217.1] + - [825, 8217.1] - - [1024, 3103, 1, 4096] - - [720, 7872.67] + - [826, 7872.67] - - [33708, 3995, 1, 1024] - - [700, 9893.08] + - [806, 9893.08] - - [1024, 3297, 1, 4096] - - [719, 8185.82] + - [825, 8185.82] - - [4096, 3545, 1, 1024] - - [700, 9789.43] + - [806, 9789.43] - - [1024, 3399, 1, 4096] - - [719, 8377.18] + - [825, 8377.18] - - [33708, 3796, 1, 1024] - - [699, 10008.0] + - [805, 10008.0] - - [4096, 3292, 1, 1024] - - [700, 9767.28] + - [806, 9767.28] - - [71, 71, 896, 64] - - [734, 3006.25] + - [840, 3006.25] - - [33708, 3859, 1, 1024] - - [701, 9860.37] + - [807, 9860.37] - - [4096, 3566, 1, 1024] - - [700, 9834.47] + - [806, 9834.47] - - [4096, 3894, 1, 1024] - - [698, 9456.67] + - [804, 9456.67] - - [4096, 3492, 1, 1024] - - [698, 9653.24] + - [804, 9653.24] - - [1024, 3977, 1, 1024] - - [704, 9161.33] + - [810, 9161.33] - - [1024, 3272, 1, 4096] - - [721, 8257.09] + - [827, 8257.09] - - [135, 134, 480, 64] - - [745, 4238.39] + - [851, 4238.39] - - [1024, 3355, 1, 4096] - - [719, 8374.64] + - [825, 8374.64] - - [4096, 3419, 1, 1024] - - [701, 9455.44] + - [807, 9455.44] - - [1024, 3404, 1, 4096] - - [720, 8580.28] + - [826, 8580.28] - - [4096, 3999, 1, 1024] - - [700, 9701.78] + - [806, 9701.78] - - [4096, 3166, 1, 1024] - - [698, 9410.48] + - [804, 9410.48] - - [33708, 3840, 1, 1024] - - [701, 10132.9] + - [807, 10132.9] - - [4096, 4032, 1, 1024] - - [701, 9762.86] + - [807, 9762.86] - - [1024, 3573, 1, 4096] - - [719, 8603.4] + - [825, 8603.4] - - [4096, 3366, 1, 1024] - - [701, 9322.63] + - [807, 9322.63] - - [1024, 3541, 1, 4096] - - [721, 8405.9] + - [827, 8405.9] - - [4096, 3207, 1, 1024] - - [698, 9544.25] + - [804, 9544.25] - - [4096, 3272, 1, 1024] - - [700, 9716.73] + - [806, 9716.73] - - [1024, 3334, 1, 4096] - - [718, 8241.39] + - [824, 8241.39] - - [228, 228, 272, 64] - - [733, 6232.45] + - [839, 6232.45] - - [4096, 3183, 1, 1024] - - [700, 9452.44] + - [806, 9452.44] - - [4096, 3536, 1, 1024] - - [699, 9759.44] + - [805, 9759.44] - - [1024, 4005, 1, 1024] - - [703, 9225.83] + - [809, 9225.83] - - [1024, 3245, 1, 4096] - - [720, 8074.31] + - [826, 8074.31] - - [4096, 3447, 1, 1024] - - [699, 9525.84] + - [805, 9525.84] - - [1024, 3183, 1, 4096] - - [719, 8121.62] + - [825, 8121.62] - - [1024, 3361, 1, 4096] - - [721, 8285.86] + - [827, 8285.86] - - [33708, 3870, 1, 1024] - - [699, 9879.35] + - [805, 9879.35] - - [1024, 3321, 1, 4096] - - [720, 8408.67] + - [826, 8408.67] - - [1024, 3968, 1, 1024] - - [702, 9202.05] + - [808, 9202.05] - - [1024, 3486, 1, 4096] - - [716, 8258.89] + - [822, 8258.89] - - [4096, 4005, 1, 1024] - - [700, 9723.98] + - [806, 9723.98] - - [4096, 3410, 1, 1024] - - [701, 9440.5] + - [807, 9440.5] - - [1024, 3944, 1, 1024] - - [704, 9040.82] + - [810, 9040.82] - - [4096, 3300, 1, 1024] - - [699, 9789.9] + - [805, 9789.9] - - [4096, 3579, 1, 1024] - - [701, 9859.44] + - [807, 9859.44] - - [4096, 3483, 1, 1024] - - [701, 9624.31] + - [807, 9624.31] - - [4096, 3532, 1, 1024] - - [700, 9742.76] + - [806, 9742.76] - - [1024, 3140, 1, 4096] - - [720, 7899.65] + - [826, 7899.65] - - [1024, 3372, 1, 4096] - - [718, 8237.07] + - [824, 8237.07] - - [1024, 3224, 1, 4096] - - [721, 8159.13] + - [827, 8159.13] - - [4096, 3230, 1, 1024] - - [700, 9601.25] + - [806, 9601.25] - - [4096, 3427, 1, 1024] - - [700, 9466.57] + - [806, 9466.57] - - [1024, 3796, 1, 1024] - - [704, 8739.78] + - [810, 8739.78] - - [143, 148, 432, 64] - - [745, 4762.0] + - [851, 4762.0] - - [1024, 3616, 1, 4096] - - [703, 8445.89] + - [809, 8445.89] - - [1024, 3315, 1, 4096] - - [720, 8403.21] + - [826, 8403.21] - - [1024, 3476, 1, 4096] - - [718, 8523.68] + - [824, 8523.68] - - [1024, 3509, 1, 4096] - - [718, 8345.05] + - [824, 8345.05] - - [4096, 3357, 1, 1024] - - [700, 9300.16] + - [806, 9300.16] - - [4096, 3406, 1, 1024] - - [700, 9427.44] + - [806, 9427.44] - - [1024, 3558, 1, 4096] - - [719, 8525.78] + - [825, 8525.78] - - [4096, 3593, 1, 1024] - - [700, 9302.2] + - [806, 9302.2] - - [4096, 3247, 1, 1024] - - [700, 9648.5] + - [806, 9648.5] - - [4096, 3088, 1, 1024] - - [700, 9204.21] + - [806, 9204.21] - - [1024, 3213, 1, 4096] - - [718, 8054.31] + - [824, 8054.31] - - [4096, 3511, 1, 1024] - - [698, 9702.7] + - [804, 9702.7] - - [122, 122, 528, 64] - - [739, 6293.39] + - [845, 6293.39] - - [1024, 3365, 1, 4096] - - [715, 8413.62] + - [821, 8413.62] - - [1024, 3504, 1, 4096] - - [717, 8414.46] + - [823, 8414.46] - - [1024, 3442, 1, 4096] - - [720, 8684.0] + - [826, 8684.0] - - [4096, 3474, 1, 1024] - - [698, 9611.6] + - [804, 9611.6] - - [4096, 2984, 1, 1024] - - [699, 9592.82] + - [805, 9592.82] - - [1024, 3876, 1, 4096] - - [702, 9085.95] + - [808, 9085.95] - - [4096, 3337, 1, 1024] - - [700, 9246.22] + - [806, 9246.22] - - [4096, 3450, 1, 1024] - - [700, 9534.63] + - [806, 9534.63] - - [1024, 3547, 1, 4096] - - [720, 8386.73] + - [826, 8386.73] - - [4096, 3291, 1, 1024] - - [699, 9759.34] + - [805, 9759.34] - - [1024, 3340, 1, 4096] - - [719, 8237.97] + - [825, 8237.97] - - [4096, 3491, 1, 1024] - - [700, 9656.59] + - [806, 9656.59] - - [4096, 3348, 1, 1024] - - [700, 9279.15] + - [806, 9279.15] - - [78, 78, 816, 64] - - [740, 3591.09] + - [846, 3591.09] - - [4096, 3968, 1, 1024] - - [701, 9642.19] + - [807, 9642.19] - - [4096, 3906, 1, 1024] - - [701, 9485.37] + - [807, 9485.37] - - [1024, 3477, 1, 4096] - - [708, 8389.2] + - [814, 8389.2] - - [1024, 3397, 1, 4096] - - [718, 8556.88] + - [824, 8556.88] - - [4096, 3165, 1, 1024] - - [699, 9415.52] + - [805, 9415.52] - - [4096, 3470, 1, 1024] - - [698, 9598.5] + - [804, 9598.5] - - [1024, 3526, 1, 4096] - - [718, 8442.15] + - [824, 8442.15] - - [112, 112, 576, 64] - - [733, 5672.6] + - [839, 5672.6] - - [4096, 3365, 1, 1024] - - [698, 9321.83] + - [804, 9321.83] - - [4096, 3319, 1, 1024] - - [698, 9838.48] + - [804, 9838.48] - - [1024, 3401, 1, 4096] - - [720, 8460.86] + - [826, 8460.86] - - [1024, 3294, 1, 4096] - - [719, 8324.63] + - [825, 8324.63] - - [159, 159, 400, 64] - - [735, 5488.51] + - [841, 5488.51] - - [1024, 3472, 1, 4096] - - [713, 8289.77] + - [819, 8289.77] - - [4096, 3328, 1, 1024] - - [699, 9904.35] + - [805, 9904.35] - - [1024, 3861, 1, 1024] - - [704, 8917.63] + - [810, 8917.63] - - [1024, 3910, 1, 1024] - - [702, 9010.16] + - [808, 9010.16] - - [1024, 3410, 1, 4096] - - [720, 8519.63] + - [826, 8519.63] - - [1024, 3395, 1, 4096] - - [718, 8424.35] + - [824, 8424.35] - - [4096, 3282, 1, 1024] - - [698, 9743.67] + - [804, 9743.67] - - [1024, 3751, 1, 1024] - - [705, 8680.39] + - [811, 8680.39] - - [4096, 3145, 1, 1024] - - [700, 9353.37] + - [806, 9353.37] - - [4096, 3514, 1, 1024] - - [700, 9713.04] + - [806, 9713.04] - - [4096, 3944, 1, 1024] - - [700, 9563.92] + - [806, 9563.92] - - [1024, 3515, 1, 4096] - - [719, 8428.13] + - [825, 8428.13] - - [4096, 3409, 1, 1024] - - [699, 9428.77] + - [805, 9428.77] - - [4096, 3564, 1, 1024] - - [698, 9823.79] + - [804, 9823.79] - - [4096, 3299, 1, 1024] - - [700, 9793.03] + - [806, 9793.03] - - [1024, 3057, 1, 4096] - - [696, 9237.85] + - [802, 9237.85] - - [4096, 3531, 1, 1024] - - [698, 9745.64] + - [804, 9745.64] - - [4096, 3388, 1, 1024] - - [700, 9374.65] + - [806, 9374.65] - - [1024, 3189, 1, 4096] - - [720, 8084.6] + - [826, 8084.6] - - [1024, 3300, 1, 4096] - - [720, 8185.13] + - [826, 8185.13] - - [1024, 3720, 1, 4096] - - [699, 8755.11] + - [805, 8755.11] - - [1024, 3383, 1, 4096] - - [713, 8463.47] + - [819, 8463.47] - - [1024, 3494, 1, 4096] - - [720, 8676.57] + - [826, 8676.57] - - [77, 78, 816, 64] - - [736, 3548.26] + - [842, 3548.26] - - [1024, 3448, 1, 4096] - - [718, 8665.78] + - [824, 8665.78] - - [4096, 3542, 1, 1024] - - [698, 9771.88] + - [804, 9771.88] - - [1024, 3488, 1, 4096] - - [718, 8488.39] + - [824, 8488.39] - - [4096, 3405, 1, 1024] - - [700, 9426.16] + - [806, 9426.16] - - [1024, 3262, 1, 4096] - - [720, 8206.97] + - [826, 8206.97] - - [33708, 4005, 1, 1024] - - [701, 9928.16] + - [807, 9928.16] - - [1024, 3594, 1, 4096] - - [705, 8458.57] + - [811, 8458.57] - - [4096, 3103, 1, 1024] - - [701, 9243.14] + - [807, 9243.14] - - [4096, 3136, 1, 1024] - - [700, 9340.9] + - [806, 9340.9] - - [1024, 3378, 1, 4096] - - [721, 8432.45] + - [827, 8432.45] - - [10, 10, 5952, 64] - - [741, 523.353] + - [847, 523.353] - - [7, 7, 8192, 64] - - [741, 260.543] + - [847, 260.543] - - [4096, 3559, 1, 1024] - - [700, 9813.1] + - [806, 9813.1] - - [4096, 3368, 1, 1024] - - [701, 9328.66] + - [807, 9328.66] - - [4096, 3209, 1, 1024] - - [698, 9538.83] + - [804, 9538.83] - - [4096, 3322, 1, 1024] - - [700, 9839.58] + - [806, 9839.58] - - [1024, 3483, 1, 4096] - - [706, 8348.35] + - [812, 8348.35] - - [4096, 3473, 1, 1024] - - [699, 9605.79] + - [805, 9605.79] - - [4096, 3522, 1, 1024] - - [701, 9730.02] + - [807, 9730.02] - - [1024, 3532, 1, 4096] - - [719, 8474.32] + - [825, 8474.32] - - [4096, 3449, 1, 1024] - - [700, 9528.35] + - [806, 9528.35] - - [1024, 3351, 1, 4096] - - [721, 8311.23] + - [827, 8311.23] - - [1024, 3462, 1, 4096] - - [718, 8297.64] + - [824, 8297.64] - - [4096, 3396, 1, 1024] - - [700, 9400.25] + - [806, 9400.25] - - [132, 132, 480, 64] - - [746, 4089.84] + - [852, 4089.84] - - [111, 112, 576, 64] - - [732, 5529.7] + - [838, 5529.7] - - [1024, 3416, 1, 4096] - - [719, 8556.64] + - [825, 8556.64] - - [4096, 3469, 1, 1024] - - [701, 9598.77] + - [807, 9598.77] - - [1024, 3582, 1, 4096] - - [702, 8461.47] + - [808, 8461.47] - - [1024, 3230, 1, 4096] - - [719, 8188.94] + - [825, 8188.94] - - [1024, 3489, 1, 4096] - - [720, 8457.85] + - [826, 8457.85] - - [1024, 3427, 1, 4096] - - [720, 8566.59] + - [826, 8566.59] - - [1024, 3346, 1, 4096] - - [719, 8352.17] + - [825, 8352.17] - - [33708, 3977, 1, 1024] - - [701, 9868.5] + - [807, 9868.5] - - [4096, 3796, 1, 1024] - - [700, 9797.76] + - [806, 9797.76] - - [4096, 3176, 1, 1024] - - [700, 9435.39] + - [806, 9435.39] - - [4096, 3990, 1, 1024] - - [698, 9672.33] + - [804, 9672.33] - - [1024, 3257, 1, 4096] - - [721, 8225.17] + - [827, 8225.17] - - [4096, 3343, 1, 1024] - - [722, 9273.62] + - [828, 9273.62] - - [4096, 3440, 1, 1024] - - [698, 9501.48] + - [804, 9501.48] - - [33708, 4030, 1, 1024] - - [699, 9983.36] + - [805, 9983.36] - - [1024, 3190, 1, 4096] - - [720, 8192.11] + - [826, 8192.11] - - [1024, 3389, 1, 4096] - - [721, 8439.42] + - [827, 8439.42] - - [1024, 3500, 1, 4096] - - [719, 8556.12] + - [825, 8556.12] - - [1024, 3471, 1, 4096] - - [708, 8491.17] + - [814, 8491.17] - - [1024, 3438, 1, 4096] - - [721, 8567.95] + - [827, 8567.95] - - [4096, 3513, 1, 1024] - - [698, 9710.27] + - [804, 9710.27] - - [1024, 3562, 1, 4096] - - [713, 8608.94] + - [819, 8608.94] - - [4096, 3616, 1, 1024] - - [700, 9357.59] + - [806, 9357.59] - - [4096, 3955, 1, 1024] - - [699, 9589.71] + - [805, 9589.71] - - [1024, 3441, 1, 4096] - - [709, 8359.27] + - [815, 8359.27] - - [1024, 3236, 1, 4096] - - [723, 8022.6] + - [829, 8022.6] - - [1024, 3524, 1, 4096] - - [718, 8477.24] + - [824, 8477.24] - - [4096, 3460, 1, 1024] - - [698, 9581.96] + - [804, 9581.96] - - [16, 16, 3840, 64] - - [730, 1270.59] + - [836, 1270.59] - - [92, 93, 688, 64] - - [734, 4962.4] + - [840, 4962.4] - - [1024, 3384, 1, 4096] - - [709, 8409.39] + - [815, 8409.39] - - [4096, 3387, 1, 1024] - - [700, 9379.8] + - [806, 9379.8] - - [4096, 3436, 1, 1024] - - [698, 9491.93] + - [804, 9491.93] - - [4096, 3277, 1, 1024] - - [698, 9717.27] + - [804, 9717.27] - - [1024, 3457, 1, 4096] - - [718, 8279.22] + - [824, 8279.22] - - [1024, 3999, 1, 4096] - - [693, 9231.47] + - [799, 9231.47] - - [1024, 4032, 1, 4096] - - [702, 9443.62] + - [808, 9443.62] - - [4096, 3541, 1, 1024] - - [698, 9773.24] + - [804, 9773.24] - - [4096, 3334, 1, 1024] - - [698, 9242.79] + - [804, 9242.79] - - [1024, 3393, 1, 4096] - - [720, 8376.17] + - [826, 8376.17] - - [17, 17, 3632, 64] - - [742, 1425.77] + - [848, 1425.77] - - [1024, 3411, 1, 4096] - - [708, 8490.97] + - [814, 8490.97] - - [1024, 3822, 1, 1024] - - [705, 8773.44] + - [811, 8773.44] - - [1024, 3593, 1, 4096] - - [705, 8571.25] + - [811, 8571.25] - - [33708, 3822, 1, 1024] - - [699, 10056.8] + - [805, 10056.8] - - [4096, 3504, 1, 1024] - - [701, 9680.29] + - [807, 9680.29] - - [1024, 3163, 1, 4096] - - [720, 8014.43] + - [826, 8014.43] - - [1024, 3357, 1, 4096] - - [721, 8376.04] + - [827, 8376.04] - - [1024, 3906, 1, 4096] - - [702, 9108.22] + - [808, 9108.22] - - [4096, 3415, 1, 1024] - - [698, 9443.87] + - [804, 9443.87] - - [1024, 3406, 1, 4096] - - [721, 8451.64] + - [827, 8451.64] - - [4096, 3321, 1, 1024] - - [700, 9836.62] + - [806, 9836.62] - - [4096, 3584, 1, 1024] - - [701, 9915.93] + - [807, 9915.93] - - [1024, 2736, 1, 4096] - - [704, 8532.93] + - [810, 8532.93] - - [1024, 3110, 1, 4096] - - [721, 7889.29] + - [827, 7889.29] - - [33708, 3999, 1, 1024] - - [701, 9903.33] + - [807, 9903.33] - - [1024, 3093, 1, 4096] - - [719, 7919.35] + - [825, 7919.35] - - [4096, 3378, 1, 1024] - - [701, 9362.3] + - [807, 9362.3] - - [1024, 3543, 1, 4096] - - [715, 8438.16] + - [821, 8438.16] - - [33708, 3925, 1, 1024] - - [700, 10021.6] + - [806, 10021.6] - - [1024, 3352, 1, 4096] - - [721, 8333.82] + - [827, 8333.82] - - [4096, 3780, 1, 1024] - - [698, 9755.02] + - [804, 9755.02] - - [1024, 3990, 1, 4096] - - [695, 9251.02] + - [801, 9251.02] - - [4096, 3500, 1, 1024] - - [698, 9673.83] + - [804, 9673.83] - - [4096, 3996, 1, 1024] - - [699, 9694.5] + - [805, 9694.5] - - [1024, 3247, 1, 4096] - - [724, 8171.58] + - [830, 8171.58] - - [4096, 3395, 1, 1024] - - [700, 9392.04] + - [806, 9392.04] - - [1024, 3169, 1, 4096] - - [719, 7990.24] + - [825, 7990.24] - - [1024, 3088, 1, 4096] - - [719, 7890.36] + - [825, 7890.36] - - [1024, 3584, 1, 4096] - - [721, 8604.2] + - [827, 8604.2] - - [4096, 3093, 1, 1024] - - [700, 9224.88] + - [806, 9224.88] - - [1024, 3538, 1, 4096] - - [702, 8395.74] + - [808, 8395.74] - - [1024, 3996, 1, 1024] - - [703, 9208.33] + - [809, 9208.33] - - [1024, 3581, 1, 4096] - - [715, 8523.24] + - [821, 8523.24] - - [4096, 3374, 1, 1024] - - [700, 9342.81] + - [806, 9342.81] - - [33708, 3751, 1, 1024] - - [700, 9881.99] + - [806, 9881.99] - - [59, 59, 1088, 64] - - [738, 4515.54] + - [844, 4515.54] - - [4096, 3215, 1, 1024] - - [700, 9557.75] + - [806, 9557.75] - - [4096, 3312, 1, 1024] - - [698, 9834.4] + - [804, 9834.4] - - [4096, 3581, 1, 1024] - - [700, 9856.66] + - [806, 9856.66] - - [4096, 3479, 1, 1024] - - [700, 9620.35] + - [806, 9620.35] - - [4096, 3544, 1, 1024] - - [698, 9778.94] + - [804, 9778.94] - - [1024, 3870, 1, 1024] - - [703, 8935.26] + - [809, 8935.26] - - [1024, 3374, 1, 4096] - - [720, 8412.85] + - [826, 8412.85] - - [1024, 2967, 1, 4096] - - [703, 8982.97] + - [809, 8982.97] - - [41, 41, 1552, 64] - - [732, 2805.38] + - [838, 2805.38] - - [4096, 3455, 1, 1024] - - [698, 9538.89] + - [804, 9538.89] - - [4096, 3942, 1, 1024] - - [699, 9554.65] + - [805, 9554.65] - - [1024, 3528, 1, 4096] - - [718, 8438.47] + - [824, 8438.47] - - [4096, 3186, 1, 1024] - - [699, 9468.32] + - [805, 9468.32] - - [1024, 3976, 1, 1024] - - [703, 9167.08] + - [809, 9167.08] - - [1024, 3511, 1, 4096] - - [705, 8335.06] + - [811, 8335.06] - - [4096, 3573, 1, 1024] - - [698, 9855.33] + - [804, 9855.33] - - [4096, 3561, 1, 1024] - - [698, 9831.03] + - [804, 9831.03] - - [4096, 3418, 1, 1024] - - [699, 9450.68] + - [805, 9450.68] - - [33708, 3906, 1, 1024] - - [701, 9973.67] + - [807, 9973.67] - - [4096, 3259, 1, 1024] - - [698, 9685.26] + - [804, 9685.26] - - [4096, 3308, 1, 1024] - - [700, 9792.03] + - [806, 9792.03] - - [1024, 3419, 1, 4096] - - [720, 8514.53] + - [826, 8514.53] - - [1024, 3215, 1, 4096] - - [719, 8137.53] + - [825, 8137.53] - - [1024, 4030, 1, 4096] - - [701, 9290.76] + - [807, 9290.76] - - [4096, 3459, 1, 1024] - - [698, 9567.57] + - [804, 9567.57] - - [1024, 3572, 1, 4096] - - [718, 8501.43] + - [824, 8501.43] - - [1024, 3137, 1, 4096] - - [720, 7930.15] + - [826, 7930.15] - - [1024, 3312, 1, 4096] - - [721, 8378.6] + - [827, 8378.6] - - [1024, 3925, 1, 4096] - - [703, 9255.86] + - [809, 9255.86] - - [1024, 3453, 1, 4096] - - [720, 8630.76] + - [826, 8630.76] - - [4096, 3435, 1, 1024] - - [699, 9495.18] + - [805, 9495.18] - - [1024, 3176, 1, 4096] - - [720, 8087.23] + - [826, 8087.23] - - [1024, 3444, 1, 4096] - - [712, 8528.58] + - [818, 8528.58] - - [4096, 3975, 1, 1024] - - [701, 9645.34] + - [807, 9645.34] - - [4096, 3182, 1, 1024] - - [700, 9448.4] + - [806, 9448.4] - - [1024, 3475, 1, 4096] - - [719, 8404.87] + - [825, 8404.87] - - [9, 9, 6544, 64] - - [734, 425.854] + - [840, 425.854] - - [33708, 3955, 1, 1024] - - [701, 10088.4] + - [807, 10088.4] - - [4096, 3446, 1, 1024] - - [700, 9520.06] + - [806, 9520.06] - - [1024, 3138, 1, 4096] - - [719, 8053.44] + - [825, 8053.44] - - [1024, 3549, 1, 4096] - - [705, 8426.42] + - [811, 8426.42] - - [4096, 3287, 1, 1024] - - [701, 9751.34] + - [807, 9751.34] - - [1024, 3342, 1, 4096] - - [718, 8320.01] + - [824, 8320.01] - - [102, 102, 624, 64] - - [733, 4747.52] + - [839, 4747.52] - - [4096, 3519, 1, 1024] - - [700, 9716.1] + - [806, 9716.1] - - [4096, 3552, 1, 1024] - - [698, 9806.69] + - [804, 9806.69] - - [4096, 3859, 1, 1024] - - [698, 9369.94] + - [804, 9369.94] - - [33708, 3969, 1, 1024] - - [698, 9830.39] + - [804, 9830.39] - - [1024, 3369, 1, 4096] - - [719, 8379.26] + - [825, 8379.26] - - [4096, 3482, 1, 1024] - - [698, 9631.7] + - [804, 9631.7] - - [1024, 3306, 1, 4096] - - [721, 8320.06] + - [827, 8320.06] - - [1024, 3474, 1, 4096] - - [720, 8498.9] + - [826, 8498.9] - - [99, 99, 624, 64] - - [732, 4492.9] + - [838, 4492.9] - - [4096, 3377, 1, 1024] - - [698, 9369.92] + - [804, 9369.92] - - [4096, 3426, 1, 1024] - - [698, 9467.3] + - [804, 9467.3] - - [4096, 2935, 1, 1024] - - [699, 9423.74] + - [805, 9423.74] - - [4096, 3267, 1, 1024] - - [698, 9698.04] + - [804, 9698.04] - - [1024, 3299, 1, 4096] - - [719, 8264.76] + - [825, 8264.76] - - [1024, 3456, 1, 4096] - - [718, 8678.39] + - [824, 8678.39] - - [1024, 3280, 1, 4096] - - [719, 8220.69] + - [825, 8220.69] - - [1024, 3555, 1, 4096] - - [718, 8656.27] + - [824, 8656.27] - - [4096, 3499, 1, 1024] - - [700, 9663.93] + - [806, 9663.93] - - [4096, 3356, 1, 1024] - - [700, 9296.9] + - [806, 9296.9] - - [100, 102, 624, 64] - - [733, 4671.51] + - [839, 4671.51] - - [1024, 3412, 1, 4096] - - [721, 8538.05] + - [827, 8538.05] - - [1024, 2984, 1, 4096] - - [704, 9193.17] + - [810, 9193.17] - - [4096, 3141, 1, 1024] - - [700, 9349.43] + - [806, 9349.43] - - [4096, 3510, 1, 1024] - - [698, 9701.98] + - [804, 9701.98] - - [1024, 3995, 1, 1024] - - [702, 9243.4] + - [808, 9243.4] - - [1024, 3517, 1, 4096] - - [720, 8569.31] + - [826, 8569.31] - - [1024, 3455, 1, 4096] - - [720, 8560.67] + - [826, 8560.67] - - [1024, 3939, 1, 1024] - - [703, 9030.94] + - [809, 9030.94] - - [38, 38, 1680, 64] - - [732, 2459.84] + - [838, 2459.84] - - [1024, 3447, 1, 4096] - - [718, 8610.02] + - [824, 8610.02] - - [1024, 3969, 1, 4096] - - [705, 9097.33] + - [811, 9097.33] - - [4096, 3527, 1, 1024] - - [700, 9743.83] + - [806, 9743.83] - - [4096, 3336, 1, 1024] - - [700, 9248.33] + - [806, 9248.33] - - [1024, 3191, 1, 4096] - - [718, 8104.96] + - [824, 8104.96] - - [1024, 3302, 1, 4096] - - [719, 8245.09] + - [825, 8245.09] - - [1024, 3337, 1, 4096] - - [721, 8254.25] + - [827, 8254.25] - - [4096, 3290, 1, 1024] - - [700, 9759.13] + - [806, 9759.13] - - [1024, 3512, 1, 4096] - - [709, 8641.06] + - [815, 8641.06] - - [1024, 3433, 1, 4096] - - [719, 8444.7] + - [825, 8444.7] - - [4096, 3876, 1, 1024] - - [699, 9420.38] + - [805, 9420.38] - - [4096, 3490, 1, 1024] - - [700, 9641.11] + - [806, 9641.11] - - [4096, 3064, 1, 1024] - - [700, 9820.49] + - [806, 9820.49] - - [1024, 3508, 1, 4096] - - [715, 8442.24] + - [821, 8442.24] - - [1024, 3956, 1, 4096] - - [700, 9128.19] + - [806, 9128.19] - - [4096, 3417, 1, 1024] - - [700, 9448.41] + - [806, 9448.41] - - [1024, 3248, 1, 4096] - - [719, 8006.16] + - [825, 8006.16] - - [1024, 2499, 1, 4096] - - [719, 8155.19] + - [825, 8155.19] - - [1024, 3186, 1, 4096] - - [719, 8093.04] + - [825, 8093.04] - - [1024, 3180, 1, 4096] - - [721, 8097.02] + - [827, 8097.02] - - [4096, 3364, 1, 1024] - - [700, 9318.08] + - [806, 9318.08] - - [4096, 3976, 1, 1024] - - [700, 9654.47] + - [806, 9654.47] - - [4096, 3205, 1, 1024] - - [701, 9538.84] + - [807, 9538.84] - - [4096, 3318, 1, 1024] - - [698, 9838.29] + - [804, 9838.29] - - [1024, 3377, 1, 4096] - - [721, 8445.64] + - [827, 8445.64] - - [1024, 3485, 1, 4096] - - [718, 8368.83] + - [824, 8368.83] - - [4096, 3181, 1, 1024] - - [701, 9458.29] + - [807, 9458.29] - - [4096, 3550, 1, 1024] - - [698, 9783.14] + - [804, 9783.14] - - [1024, 3534, 1, 4096] - - [707, 8684.99] + - [813, 8684.99] - - [1024, 3860, 1, 1024] - - [702, 8923.18] + - [808, 8923.18] - - [160, 160, 400, 64] - - [745, 5797.69] + - [851, 5797.69] - - [4096, 3445, 1, 1024] - - [700, 9511.28] + - [806, 9511.28] - - [1024, 3391, 1, 4096] - - [721, 8541.77] + - [827, 8541.77] - - [1024, 3221, 1, 4096] - - [719, 8055.5] + - [825, 8055.5] - - [4096, 3079, 1, 1024] - - [698, 9181.04] + - [804, 9181.04] - - [4096, 3144, 1, 1024] - - [700, 9351.45] + - [806, 9351.45] - - [1024, 3270, 1, 4096] - - [720, 8367.63] + - [826, 8367.63] - - [1024, 3561, 1, 4096] - - [720, 8426.29] + - [826, 8426.29] - - [1024, 3480, 1, 4096] - - [707, 8465.0] + - [813, 8465.0] - - [4096, 3408, 1, 1024] - - [700, 9420.04] + - [806, 9420.04] - - [1024, 3418, 1, 4096] - - [721, 8481.02] + - [827, 8481.02] - - [4096, 3298, 1, 1024] - - [701, 9788.4] + - [807, 9788.4] - - [1024, 3640, 1, 1024] - - [704, 8435.44] + - [810, 8435.44] - - [1024, 3449, 1, 4096] - - [719, 8590.87] + - [825, 8590.87] - - [1024, 4020, 1, 4096] - - [697, 9168.13] + - [803, 9168.13] - - [4096, 3481, 1, 1024] - - [698, 9627.91] + - [804, 9627.91] - - [4096, 3530, 1, 1024] - - [700, 9734.68] + - [806, 9734.68] - - [1024, 3216, 1, 4096] - - [721, 8014.32] + - [827, 8014.32] - - [1024, 3840, 1, 1024] - - [704, 8908.37] + - [810, 8908.37] - - [1024, 3491, 1, 4096] - - [707, 8410.59] + - [813, 8410.59] - - [1024, 3154, 1, 4096] - - [720, 8095.69] + - [826, 8095.69] - - [4096, 3425, 1, 1024] - - [700, 9474.53] + - [806, 9474.53] - - [1024, 3348, 1, 4096] - - [718, 8202.9] + - [824, 8202.9] - - [1024, 3415, 1, 4096] - - [719, 8597.68] + - [825, 8597.68] - - [1024, 4026, 1, 1024] - - [702, 9279.09] + - [808, 9279.09] - - [1024, 3367, 1, 4096] - - [721, 8335.54] + - [827, 8335.54] - - [1024, 3259, 1, 4096] - - [721, 8285.3] + - [827, 8285.3] - - [1024, 3894, 1, 4096] - - [704, 9040.44] + - [810, 9040.44] - - [4096, 3355, 1, 1024] - - [699, 9291.67] + - [805, 9291.67] - - [4096, 3404, 1, 1024] - - [700, 9410.47] + - [806, 9410.47] - - [1024, 3308, 1, 4096] - - [721, 8336.3] + - [827, 8336.3] - - [4096, 3245, 1, 1024] - - [699, 9641.47] + - [805, 9641.47] - - [1024, 3502, 1, 4096] - - [720, 8375.9] + - [826, 8375.9] - - [33708, 4032, 1, 1024] - - [699, 9988.2] + - [805, 9988.2] - - [8, 8, 7280, 64] - - [736, 339.878] + - [842, 339.878] - - [1024, 3424, 1, 4096] - - [707, 8489.48] + - [813, 8489.48] - - [4096, 3509, 1, 1024] - - [699, 9702.29] + - [805, 9702.29] - - [4096, 3558, 1, 1024] - - [700, 9815.51] + - [806, 9815.51] - - [1024, 3900, 1, 1024] - - [703, 9014.05] + - [809, 9014.05] - - [1024, 2505, 1, 4096] - - [717, 8263.75] + - [823, 8263.75] - - [4096, 3472, 1, 1024] - - [698, 9609.61] + - [804, 9609.61] - - [1024, 3386, 1, 4096] - - [718, 8417.55] + - [824, 8417.55] - - [4096, 3383, 1, 1024] - - [700, 9364.77] + - [806, 9364.77] - - [4096, 3448, 1, 1024] - - [701, 9521.07] + - [807, 9521.07] - - [4096, 4030, 1, 1024] - - [701, 9771.56] + - [807, 9771.56] - - [4096, 3289, 1, 1024] - - [698, 9757.27] + - [804, 9757.27] - - [1024, 3459, 1, 4096] - - [720, 8422.12] + - [826, 8422.12] - - [1024, 2918, 1, 4096] - - [705, 9022.71] + - [811, 9022.71] - - [4096, 3489, 1, 1024] - - [698, 9641.9] + - [804, 9641.9] - - [4096, 3346, 1, 1024] - - [700, 9271.65] + - [806, 9271.65] - - [4096, 3572, 1, 1024] - - [700, 9829.82] + - [806, 9829.82] - - [1024, 3955, 1, 4096] - - [701, 9221.66] + - [807, 9221.66] - - [4096, 3236, 1, 1024] - - [698, 9620.72] + - [804, 9620.72] - - [4096, 3163, 1, 1024] - - [698, 9397.3] + - [804, 9397.3] - - [4096, 3468, 1, 1024] - - [698, 9601.58] + - [804, 9601.58] - - [1024, 3165, 1, 4096] - - [720, 7941.58] + - [826, 7941.58] - - [1024, 3276, 1, 4096] - - [720, 8244.96] + - [826, 8244.96] - - [1024, 3359, 1, 4096] - - [718, 8273.93] + - [824, 8273.93] - - [4096, 3363, 1, 1024] - - [700, 9315.8] + - [806, 9315.8] - - [1024, 3385, 1, 4096] - - [712, 8286.2] + - [818, 8286.2] - - [1024, 3207, 1, 4096] - - [721, 8144.02] + - [827, 8144.02] - - [1024, 3458, 1, 4096] - - [720, 8472.41] + - [826, 8472.41] - - [21, 21, 2976, 64] - - [736, 2083.3] + - [842, 2083.3] - - [4096, 3110, 1, 1024] - - [698, 9260.3] + - [804, 9260.3] - - [4096, 3925, 1, 1024] - - [701, 9526.66] + - [807, 9526.66] - - [1024, 3975, 1, 4096] - - [696, 9133.84] + - [802, 9133.84] - - [4096, 3549, 1, 1024] - - [700, 9793.77] + - [806, 9793.77] - - [4096, 3342, 1, 1024] - - [699, 9264.48] + - [805, 9264.48] - - [1024, 3859, 1, 1024] - - [702, 8933.47] + - [808, 8933.47] - - [1024, 3497, 1, 4096] - - [719, 8526.13] + - [825, 8526.13] - - [4096, 3280, 1, 1024] - - [700, 9733.32] + - [806, 9733.32] - - [1024, 3435, 1, 4096] - - [719, 8489.85] + - [825, 8489.85] - - [1024, 3354, 1, 4096] - - [719, 8248.83] + - [825, 8248.83] - - [4096, 3191, 1, 1024] - - [699, 9475.12] + - [805, 9475.12] - - [4096, 3512, 1, 1024] - - [698, 9701.37] + - [804, 9701.37] - - [1024, 3055, 1, 4096] - - [705, 9264.91] + - [811, 9264.91] - - [4096, 2499, 1, 1024] - - [700, 9574.06] + - [806, 9574.06] - - [1024, 3233, 1, 4096] - - [718, 8101.74] + - [824, 8101.74] - - [4096, 3423, 1, 1024] - - [701, 9463.5] + - [807, 9463.5] - - [1024, 3319, 1, 4096] - - [721, 8413.76] + - [827, 8413.76] - - [4096, 3297, 1, 1024] - - [698, 9782.66] + - [804, 9782.66] - - [4096, 3154, 1, 1024] - - [700, 9381.2] + - [806, 9381.2] - - [1024, 3540, 1, 4096] - - [721, 8507.53] + - [827, 8507.53] - - [1024, 3289, 1, 4096] - - [721, 8233.8] + - [827, 8233.8] - - [4096, 3529, 1, 1024] - - [700, 9741.15] + - [806, 9741.15] - - [4096, 3386, 1, 1024] - - [700, 9372.57] + - [806, 9372.57] - - [4096, 3276, 1, 1024] - - [698, 9713.76] + - [804, 9713.76] - - [1024, 3244, 1, 4096] - - [721, 8146.83] + - [827, 8146.83] - - [1024, 3182, 1, 4096] - - [718, 8115.12] + - [824, 8115.12] - - [4096, 3540, 1, 1024] - - [698, 9768.42] + - [804, 9768.42] - - [1024, 3360, 1, 4096] - - [720, 8353.31] + - [826, 8353.31] - - [1024, 3942, 1, 4096] - - [699, 9143.78] + - [805, 9143.78] - - [4096, 3403, 1, 1024] - - [701, 9412.18] + - [807, 9412.18] - - [4096, 3101, 1, 1024] - - [701, 9239.28] + - [807, 9239.28] - - [4096, 2918, 1, 1024] - - [700, 9373.75] + - [806, 9373.75] - - [1024, 3465, 1, 4096] - - [721, 8288.16] + - [827, 8288.16] - - [33708, 3780, 1, 1024] - - [700, 9971.91] + - [806, 9971.91] - - [4096, 3557, 1, 1024] - - [698, 9814.82] + - [804, 9814.82] - - [4096, 3414, 1, 1024] - - [698, 9436.63] + - [804, 9436.63] - - [1024, 3948, 1, 1024] - - [702, 9073.8] + - [808, 9073.8] - - [4096, 3320, 1, 1024] - - [700, 9834.77] + - [806, 9834.77] - - [4096, 2765, 1, 1024] - - [700, 9667.06] + - [806, 9667.06] - - [1024, 3978, 1, 4096] - - [695, 9109.6] + - [801, 9109.6] - - [4096, 3487, 1, 1024] - - [698, 9644.0] + - [804, 9644.0] - - [4096, 3520, 1, 1024] - - [700, 9728.08] + - [806, 9728.08] - - [1024, 3139, 1, 4096] - - [720, 7940.19] + - [826, 7940.19] - - [1024, 3314, 1, 4096] - - [718, 8294.01] + - [824, 8294.01] - - [4096, 3431, 1, 1024] - - [700, 9482.12] + - [806, 9482.12] - - [123, 122, 528, 64] - - [733, 6325.98] + - [839, 6325.98] - - [1024, 3446, 1, 4096] - - [714, 8468.34] + - [820, 8468.34] - - [1024, 4059, 1, 4096] - - [701, 9370.8] + - [807, 9370.8] - - [99, 102, 624, 64] - - [733, 4624.8] + - [839, 4624.8] - - [4096, 3345, 1, 1024] - - [698, 9271.32] + - [804, 9271.32] - - [4096, 3394, 1, 1024] - - [698, 9398.19] + - [804, 9398.19] - - [1024, 3927, 1, 1024] - - [703, 9041.38] + - [809, 9041.38] - - [4096, 3235, 1, 1024] - - [698, 9619.93] + - [804, 9619.93] - - [1024, 3328, 1, 4096] - - [719, 8406.09] + - [825, 8406.09] - - [33708, 3956, 1, 1024] - - [699, 10100.4] + - [805, 10100.4] - - [4096, 3467, 1, 1024] - - [700, 9586.66] + - [806, 9586.66] - - [1024, 3287, 1, 4096] - - [720, 8273.83] + - [826, 8273.83] - - [4096, 3214, 1, 1024] - - [701, 9557.49] + - [807, 9557.49] - - [4096, 3910, 1, 1024] - - [698, 9490.25] + - [804, 9490.25] - - [1024, 3780, 1, 1024] - - [705, 8706.0] + - [811, 8706.0] - - [1024, 3371, 1, 4096] - - [721, 8248.46] + - [827, 8248.46] - - [4096, 3478, 1, 1024] - - [701, 9619.62] + - [807, 9619.62] - - [1024, 3546, 1, 4096] - - [719, 8456.83] + - [825, 8456.83] - - [1024, 4012, 1, 1024] - - [702, 9253.34] + - [808, 9253.34] - - [4096, 3341, 1, 1024] - - [700, 9260.24] + - [806, 9260.24] - - [4096, 3454, 1, 1024] - - [698, 9533.62] + - [804, 9533.62] - - [4096, 3295, 1, 1024] - - [701, 9772.86] + - [807, 9772.86] - - [4096, 3072, 1, 1024] - - [698, 9887.23] + - [804, 9887.23] - - [1024, 3282, 1, 4096] - - [706, 8112.85] + - [812, 8112.85] - - [33708, 3720, 1, 1024] - - [701, 9818.85] + - [807, 9818.85] - - [1024, 3681, 1, 4096] - - [703, 8639.28] + - [809, 8639.28] - - [1024, 4050, 1, 4096] - - [701, 9291.93] + - [807, 9291.93] - - [4096, 3495, 1, 1024] - - [700, 9660.52] + - [806, 9660.52] - - [4096, 3560, 1, 1024] - - [699, 9813.8] + - [805, 9813.8] - - [4096, 3751, 1, 1024] - - [698, 9684.95] + - [804, 9684.95] - - [1024, 3414, 1, 4096] - - [719, 8555.72] + - [825, 8555.72] - - [33708, 3860, 1, 1024] - - [698, 9856.68] + - [804, 9856.68] - - [1024, 3325, 1, 4096] - - [708, 8261.21] + - [814, 8261.21] - - [4096, 3458, 1, 1024] - - [698, 9570.86] + - [804, 9570.86] - - [4096, 2967, 1, 1024] - - [698, 9544.61] + - [804, 9544.61] - - [1024, 3519, 1, 4096] - - [721, 8413.1] + - [827, 8413.1] - - [4096, 3385, 1, 1024] - - [700, 9367.34] + - [806, 9367.34] - - [4096, 3434, 1, 1024] - - [698, 9488.41] + - [804, 9488.41] - - [1024, 3552, 1, 4096] - - [719, 8456.13] + - [825, 8456.13] - - [4096, 3822, 1, 1024] - - [699, 9849.84] + - [805, 9849.84] - - [1024, 3544, 1, 4096] - - [718, 8494.56] + - [824, 8494.56] - - [4096, 3539, 1, 1024] - - [700, 9763.09] + - [806, 9763.09] - - [4096, 3332, 1, 1024] - - [698, 9232.36] + - [804, 9232.36] - - [1024, 3145, 1, 4096] - - [718, 8098.36] + - [824, 8098.36] - - [1024, 3535, 1, 4096] - - [706, 8592.8] + - [812, 8592.8] - - [1024, 3320, 1, 4096] - - [719, 8419.55] + - [825, 8419.55] - - [33708, 4012, 1, 1024] - - [701, 9940.2] + - [807, 9940.2] - - [4096, 3286, 1, 1024] - - [700, 9747.82] + - [806, 9747.82] - - [1024, 3514, 1, 4096] - - [719, 8653.69] + - [825, 8653.69] - - [93, 93, 688, 64] - - [740, 5005.79] + - [846, 5005.79] - - [1024, 2765, 1, 4096] - - [705, 8636.72] + - [811, 8636.72] - - [1024, 3452, 1, 4096] - - [718, 8445.87] + - [824, 8445.87] - - [4096, 3518, 1, 1024] - - [698, 9722.56] + - [804, 9722.56] - - [1024, 3529, 1, 4096] - - [718, 8444.32] + - [824, 8444.32] - - [4096, 3413, 1, 1024] - - [698, 9436.35] + - [804, 9436.35] - - [33708, 4050, 1, 1024] - - [700, 10026.7] + - [806, 10026.7] - - [1024, 3525, 1, 4096] - - [711, 8488.99] + - [817, 8488.99] - - [4096, 3303, 1, 1024] - - [698, 9791.05] + - [804, 9791.05] - - [1024, 3382, 1, 4096] - - [719, 8483.63] + - [825, 8483.63] - - [1024, 3390, 1, 4096] - - [718, 8552.81] + - [824, 8552.81] - - [1024, 3977, 1, 4096] - - [700, 9053.53] + - [806, 9053.53] - - [1024, 3184, 1, 4096] - - [718, 8008.81] + - [824, 8008.81] - - [4096, 3535, 1, 1024] - - [700, 9760.79] + - [806, 9760.79] - - [4096, 3376, 1, 1024] - - [701, 9341.93] + - [807, 9341.93] - - [4096, 3978, 1, 1024] - - [701, 9642.8] + - [807, 9642.8] - - [1024, 3136, 1, 4096] - - [720, 8085.12] + - [826, 8085.12] - - [1024, 3293, 1, 4096] - - [718, 8300.49] + - [824, 8300.49] - - [4096, 3266, 1, 1024] - - [699, 9691.78] + - [805, 9691.78] - - [1024, 3487, 1, 4096] - - [718, 8383.62] + - [824, 8383.62] - - [1024, 3409, 1, 4096] - - [720, 8493.25] + - [826, 8493.25] - - [4096, 3498, 1, 1024] - - [699, 9672.38] + - [805, 9672.38] - - [1024, 3520, 1, 4096] - - [721, 8488.26] + - [827, 8488.26] - - [1024, 3530, 1, 4096] - - [702, 8409.87] + - [808, 8409.87] - - [4096, 3393, 1, 1024] - - [700, 9395.43] + - [806, 9395.43] - - [4096, 3140, 1, 1024] - - [700, 9338.5] + - [806, 9338.5] - - [1024, 3536, 1, 4096] - - [721, 8642.11] + - [827, 8642.11] - - [1024, 3288, 1, 4096] - - [721, 8229.34] + - [827, 8229.34] - - [1024, 4005, 1, 4096] - - [703, 9271.04] + - [809, 9271.04] - - [1024, 3579, 1, 4096] - - [707, 8844.5] + - [813, 8844.5] - - [4096, 3372, 1, 1024] - - [698, 9339.25] + - [804, 9339.25] - - [1024, 3440, 1, 4096] - - [718, 8466.69] + - [824, 8466.69] - - [4096, 3213, 1, 1024] - - [701, 9558.85] + - [807, 9558.85] - - [123, 123, 528, 64] - - [733, 6333.59] + - [839, 6333.59] - - [100, 100, 624, 64] - - [732, 4584.12] + - [838, 4584.12] - - [1024, 3968, 1, 4096] - - [699, 9237.6] + - [805, 9237.6] - - [4096, 3477, 1, 1024] - - [699, 9618.88] + - [805, 9618.88] - - [4096, 3526, 1, 1024] - - [698, 9735.94] + - [804, 9735.94] - - [1024, 3493, 1, 4096] - - [719, 8355.13] + - [825, 8355.13] - - [1024, 3944, 1, 4096] - - [694, 9065.39] + - [800, 9065.39] - - [4096, 3453, 1, 1024] - - [699, 9533.37] + - [805, 9533.37] - - [1024, 3350, 1, 4096] - - [721, 8448.64] + - [827, 8448.64] - - [4096, 3184, 1, 1024] - - [700, 9447.38] + - [806, 9447.38] - - [1024, 3423, 1, 4096] - - [719, 8465.38] + - [825, 8465.38] - - [4096, 3351, 1, 1024] - - [698, 9282.06] + - [804, 9282.06] - - [4096, 3416, 1, 1024] - - [698, 9446.64] + - [804, 9446.64] - - [1024, 3796, 1, 4096] - - [700, 8820.34] + - [806, 8820.34] - - [4096, 3257, 1, 1024] - - [698, 9671.64] + - [804, 9671.64] - - [4096, 3306, 1, 1024] - - [700, 9795.51] + - [806, 9795.51] - - [33708, 4020, 1, 1024] - - [700, 9961.85] + - [806, 9961.85] - - [19, 19, 3264, 64] - - [730, 1736.09] + - [836, 1736.09] - - [1024, 3426, 1, 4096] - - [718, 8518.61] + - [824, 8518.61] - - [4096, 3457, 1, 1024] - - [698, 9564.56] + - [804, 9564.56] - - [1024, 2935, 1, 4096] - - [703, 9067.79] + - [809, 9067.79] - - [1024, 3046, 1, 4096] - - [703, 9242.97] + - [809, 9242.97] - - [4096, 3433, 1, 1024] - - [700, 9495.65] + - [806, 9495.65] - - [1024, 3256, 1, 4096] - - [721, 8224.23] + - [827, 8224.23] - - [1024, 3531, 1, 4096] - - [718, 8524.19] + - [824, 8524.19] - - [4096, 3180, 1, 1024] - - [698, 9443.53] + - [804, 9443.53] - - [1024, 3388, 1, 4096] - - [720, 8352.82] + - [826, 8352.82] - - [4096, 3444, 1, 1024] - - [701, 9511.03] + - [807, 9511.03] - - [1024, 3501, 1, 4096] - - [708, 8461.12] + - [814, 8461.12] - - [1024, 3266, 1, 4096] - - [706, 8147.44] + - [812, 8147.44] - - [1024, 3267, 1, 4096] - - [721, 8391.49] + - [827, 8391.49] - - [1024, 3461, 1, 4096] - - [705, 8270.29] + - [811, 8270.29] - - [4096, 3870, 1, 1024] - - [700, 9399.69] + - [806, 9399.69] - - [4096, 3517, 1, 1024] - - [698, 9725.43] + - [804, 9725.43] - - [1024, 3566, 1, 4096] - - [721, 8669.76] + - [827, 8669.76] - - [4096, 3574, 1, 1024] - - [698, 9844.63] + - [804, 9844.63] - - [1024, 3876, 1, 1024] - - [703, 8961.74] + - [809, 8961.74] - - [25, 25, 2512, 64] - - [729, 2472.54] + - [835, 2472.54] - - [4096, 3720, 1, 1024] - - [698, 9612.49] + - [804, 9612.49] - - [4096, 3248, 1, 1024] - - [700, 9644.92] + - [806, 9644.92] - - [4096, 4059, 1, 1024] - - [698, 9826.42] + - [804, 9826.42] - - [1024, 3380, 1, 4096] - - [719, 8677.91] + - [825, 8677.91] - - [4096, 3480, 1, 1024] - - [700, 9626.16] + - [806, 9626.16] - - [1024, 3335, 1, 4096] - - [720, 8302.18] + - [826, 8302.18] - - [1024, 3345, 1, 4096] - - [720, 8323.13] + - [826, 8323.13] - - [4096, 3391, 1, 1024] - - [698, 9379.48] + - [804, 9379.48] - - [4096, 3424, 1, 1024] - - [700, 9466.77] + - [806, 9466.77] - - [1024, 3394, 1, 4096] - - [706, 8373.91] + - [812, 8373.91] - - [4096, 3265, 1, 1024] - - [700, 9700.89] + - [806, 9700.89] - - [1024, 3014, 1, 4096] - - [703, 9303.09] + - [809, 9303.09] - - [4096, 3497, 1, 1024] - - [698, 9668.6] + - [804, 9668.6] - - [4096, 3354, 1, 1024] - - [700, 9294.31] + - [806, 9294.31] - - [4096, 3055, 1, 1024] - - [699, 9780.88] + - [805, 9780.88] - - [1024, 3499, 1, 4096] - - [712, 8527.04] + - [818, 8527.04] - - [1024, 3162, 1, 4096] - - [720, 8059.02] + - [826, 8059.02] - - [4096, 3244, 1, 1024] - - [700, 9636.86] + - [806, 9636.86] - - [1024, 3437, 1, 4096] - - [719, 8583.41] + - [825, 8583.41] - - [1024, 3356, 1, 4096] - - [721, 8296.95] + - [827, 8296.95] - - [4096, 3139, 1, 1024] - - [700, 9338.7] + - [806, 9338.7] - - [4096, 3508, 1, 1024] - - [700, 9700.54] + - [806, 9700.54] - - [1024, 3235, 1, 4096] - - [718, 8314.59] + - [824, 8314.59] - - [1024, 3910, 1, 4096] - - [705, 9200.21] + - [811, 9200.21] - - [4096, 3371, 1, 1024] - - [698, 9336.97] + - [804, 9336.97] - - [1024, 3751, 1, 4096] - - [705, 8827.67] + - [811, 8827.67] - - [4096, 3325, 1, 1024] - - [698, 9845.68] + - [804, 9845.68] - - [1024, 3413, 1, 4096] - - [706, 8345.78] + - [812, 8345.78] - - [1024, 3542, 1, 4096] - - [718, 8521.71] + - [824, 8521.71] - - [18, 18, 3440, 64] - - [734, 1578.24] + - [840, 1578.24] - - [101, 102, 624, 64] - - [732, 4705.28] + - [838, 4705.28] - - [33708, 3900, 1, 1024] - - [698, 9951.05] + - [804, 9951.05] - - [4096, 3525, 1, 1024] - - [699, 9744.47] + - [805, 9744.47] - - [4096, 3382, 1, 1024] - - [699, 9359.03] + - [805, 9359.03] - - [102, 100, 624, 64] - - [733, 4671.51] + - [839, 4671.51] - - [15, 15, 4096, 64] - - [737, 1129.17] + - [843, 1129.17] - - [1024, 3339, 1, 4096] - - [707, 8326.37] + - [813, 8326.37] - - [4096, 3288, 1, 1024] - - [700, 9761.48] + - [806, 9761.48] - - [92, 92, 688, 64] - - [740, 4903.87] + - [846, 4903.87] - - [1024, 3141, 1, 4096] - - [718, 7975.64] + - [824, 7975.64] - - [1024, 3168, 1, 4096] - - [718, 8083.74] + - [824, 8083.74] - - [4096, 3488, 1, 1024] - - [700, 9646.77] + - [806, 9646.77] - - [4096, 3046, 1, 1024] - - [699, 9767.58] + - [805, 9767.58] - - [1024, 3362, 1, 4096] - - [721, 8458.15] + - [827, 8458.15] - - [33708, 3942, 1, 1024] - - [699, 10060.4] + - [805, 10060.4] - - [4096, 3399, 1, 1024] - - [700, 9406.57] + - [806, 9406.57] - - [1024, 3720, 1, 1024] - - [702, 8639.16] + - [808, 8639.16] - - [4096, 3563, 1, 1024] - - [698, 9836.55] + - [804, 9836.55] - - [1024, 3273, 1, 4096] - - [721, 8221.62] + - [827, 8221.62] - - [4096, 3162, 1, 1024] - - [700, 9400.19] + - [806, 9400.19] - - [1024, 3467, 1, 4096] - - [719, 8342.42] + - [825, 8342.42] - - [1024, 3130, 1, 4096] - - [720, 7933.88] + - [826, 7933.88] - - [1024, 3405, 1, 4096] - - [727, 8406.59] + - [833, 8406.59] - - [4096, 3362, 1, 1024] - - [698, 9312.04] + - [804, 9312.04] - - [1024, 3960, 1, 1024] - - [702, 9082.26] + - [808, 9082.26] - - [2048, 128, 1, 4096] - - [752, 5986.62] + - [858, 5986.62] - - [1024, 3712, 1, 36548] - - [750, 9456.25] + - [856, 9456.25] - - [1024, 128, 1, 1024] - - [753, 3631.53] + - [859, 3631.53] - - [3072, 128, 1, 4096] - - [749, 6145.6] + - [855, 6145.6] - - [1024, 3712, 1, 1024] - - [751, 8933.98] + - [857, 8933.98] - - [256, 256, 192, 64] - - [756, 8264.74] + - [862, 8264.74] - - [768, 4096, 1, 768] - - [769, 9642.18] + - [875, 9642.18] - - [768, 64, 1, 768] - - [766, 1850.53] + - [872, 1850.53] - - [768, 1280, 1, 768] - - [769, 8738.23] + - [875, 8738.23] - - [30522, 320, 1, 768] - - [770, 9733.69] + - [876, 9733.69] - - [128, 128, 96, 64] - - [759, 5470.93] + - [865, 5470.93] - - [2, 16, 1, 768] - - [762, 2.57742] + - [868, 2.57742] - - [30522, 1280, 1, 768] - - [768, 10128.0] + - [874, 10128.0] - - [30522, 640, 1, 768] - - [769, 9987.71] + - [875, 9987.71] - - [2, 8, 1, 768] - - [761, 1.06] + - [867, 1.06] - - [768, 4096, 1, 3072] - - [771, 9479.51] + - [877, 9479.51] - - [768, 32, 1, 768] - - [765, 880.434] + - [871, 880.434] - - [2, 64, 1, 768] - - [762, 10.09024] + - [868, 10.09024] - - [256, 256, 96, 64] - - [756, 7614.57] + - [862, 7614.57] - - [64, 64, 768, 64] - - [758, 5354.53] + - [864, 5354.53] - - [30522, 160, 1, 768] - - [767, 7740.21] + - [873, 7740.21] - - [768, 320, 1, 768] - - [760, 5423.77] + - [866, 5423.77] - - [128, 128, 384, 64] - - [757, 7180.08] + - [863, 7180.08] - - [768, 16, 1, 768] - - [763, 706.476] + - [869, 706.476] - - [3072, 4096, 1, 768] - - [772, 9961.84] + - [878, 9961.84] - - [2048, 512, 1, 100] - - [774, 5180.81] + - [880, 5180.81] - - [1024, 200, 1, 560] - - [775, 4061.29] + - [881, 4061.29] - - [256, 1280, 1, 1024] - - [782, 4337.54] + - [888, 4337.54] - - [256, 44505, 1, 1024] - - [818, 8597.79] + - [924, 8597.79] - - [10240, 8976, 1, 256] - - [821, 9471.53] + - [927, 9471.53] - - [256, 7168, 1, 1024] - - [812, 6718.66] + - [918, 6718.66] - - [8448, 8976, 1, 256] - - [804, 9601.41] + - [910, 9601.41] - - [18944, 8976, 1, 256] - - [813, 9666.36] + - [919, 9666.36] - - [256, 19200, 1, 1024] - - [789, 7489.04] + - [895, 7489.04] - - [5632, 8976, 1, 256] - - [801, 9358.49] + - [907, 9358.49] - - [256, 23552, 1, 1024] - - [816, 7980.99] + - [922, 7980.99] - - [256, 6656, 1, 1024] - - [816, 6287.32] + - [922, 6287.32] - - [256, 14336, 1, 1024] - - [811, 7049.36] + - [917, 7049.36] - - [256, 12544, 1, 1024] - - [789, 6728.57] + - [895, 6728.57] - - [2048, 684, 1, 768] - - [806, 8479.28] + - [912, 8479.28] - - [5376, 8976, 1, 256] - - [801, 9519.61] + - [907, 9519.61] - - [256, 5888, 1, 1024] - - [821, 6012.5] + - [927, 6012.5] - - [19968, 8976, 1, 256] - - [813, 9684.77] + - [919, 9684.77] - - [3840, 8976, 1, 256] - - [798, 9461.99] + - [904, 9461.99] - - [4608, 8976, 1, 256] - - [798, 9305.92] + - [904, 9305.92] - - [256, 684, 1, 1024] - - [824, 3513.16] + - [930, 3513.16] - - [256, 22016, 1, 1024] - - [789, 7643.89] + - [895, 7643.89] - - [256, 23296, 1, 1024] - - [818, 8048.22] + - [924, 8048.22] - - [4864, 8976, 1, 256] - - [796, 9545.72] + - [902, 9545.72] - - [256, 7424, 1, 1024] - - [814, 6770.75] + - [920, 6770.75] - - [18176, 8976, 1, 256] - - [821, 9729.57] + - [927, 9729.57] - - [256, 15104, 1, 1024] - - [810, 7289.18] + - [916, 7289.18] - - [8192, 8976, 1, 256] - - [813, 9395.59] + - [919, 9395.59] - - [256, 16128, 1, 1024] - - [813, 7461.38] + - [919, 7461.38] - - [13312, 8976, 1, 256] - - [821, 9551.07] + - [927, 9551.07] - - [256, 21504, 1, 1024] - - [818, 7636.03] + - [924, 7636.03] - - [6400, 8976, 1, 256] - - [805, 9561.06] + - [911, 9561.06] - - [256, 8960, 1, 1024] - - [780, 6292.46] + - [886, 6292.46] - - [1792, 8976, 1, 256] - - [795, 9372.28] + - [901, 9372.28] - - [13824, 8976, 1, 256] - - [813, 9585.37] + - [919, 9585.37] - - [11776, 8976, 1, 256] - - [813, 9560.44] + - [919, 9560.44] - - [256, 20992, 1, 1024] - - [811, 7490.75] + - [917, 7490.75] - - [20480, 8976, 1, 256] - - [821, 9610.8] + - [927, 9610.8] - - [5888, 8976, 1, 256] - - [792, 9565.3] + - [898, 9565.3] - - [256, 10496, 1, 1024] - - [783, 6632.06] + - [889, 6632.06] - - [21248, 8976, 1, 256] - - [813, 9755.87] + - [919, 9755.87] - - [5120, 8976, 1, 256] - - [821, 9244.69] + - [927, 9244.69] - - [7168, 8976, 1, 256] - - [813, 9388.52] + - [919, 9388.52] - - [2048, 1536, 1, 768] - - [802, 9446.14] + - [908, 9446.14] - - [256, 8192, 1, 1024] - - [807, 6948.99] + - [913, 6948.99] - - [4096, 8976, 1, 256] - - [812, 9116.04] + - [918, 9116.04] - - [3328, 8976, 1, 256] - - [805, 9434.65] + - [911, 9434.65] - - [1280, 8976, 1, 256] - - [803, 9129.9] + - [909, 9129.9] - - [2560, 8976, 1, 256] - - [800, 9199.58] + - [906, 9199.58] - - [3072, 8976, 1, 256] - - [815, 8963.7] + - [921, 8963.7] - - [256, 11776, 1, 1024] - - [793, 6869.9] + - [899, 6869.9] - - [18688, 8976, 1, 256] - - [821, 9726.31] + - [927, 9726.31] - - [15104, 8976, 1, 256] - - [821, 9715.81] + - [927, 9715.81] - - [23552, 8976, 1, 256] - - [813, 9648.52] + - [919, 9648.52] - - [6144, 8976, 1, 256] - - [821, 9339.9] + - [927, 9339.9] - - [12544, 8976, 1, 256] - - [821, 9654.55] + - [927, 9654.55] - - [256, 11264, 1, 1024] - - [794, 6815.08] + - [900, 6815.08] - - [2048, 114, 1, 512] - - [825, 4583.6] + - [931, 4583.6] - - [4352, 8976, 1, 256] - - [805, 9471.5] + - [911, 9471.5] - - [15360, 8976, 1, 256] - - [821, 9583.87] + - [927, 9583.87] - - [256, 31488, 1, 1024] - - [820, 8438.11] + - [926, 8438.11] - - [28672, 8976, 1, 256] - - [813, 9688.95] + - [919, 9688.95] - - [256, 18176, 1, 1024] - - [789, 7405.19] + - [895, 7405.19] - - [9728, 8976, 1, 256] - - [821, 9524.25] + - [927, 9524.25] - - [256, 2816, 1, 1024] - - [785, 5405.76] + - [891, 5405.76] - - [256, 18944, 1, 1024] - - [789, 7503.51] + - [895, 7503.51] - - [256, 3584, 1, 1024] - - [788, 6107.25] + - [894, 6107.25] - - [7936, 8976, 1, 256] - - [801, 9608.41] + - [907, 9608.41] - - [19712, 8976, 1, 256] - - [821, 9736.35] + - [927, 9736.35] - - [256, 14848, 1, 1024] - - [794, 7163.52] + - [900, 7163.52] - - [256, 8448, 1, 1024] - - [794, 6372.66] + - [900, 6372.66] - - [256, 6400, 1, 1024] - - [808, 6395.81] + - [914, 6395.81] - - [256, 6144, 1, 1024] - - [819, 6490.32] + - [925, 6490.32] - - [9472, 8976, 1, 256] - - [798, 9610.02] + - [904, 9610.02] - - [256, 9984, 1, 1024] - - [781, 6484.85] + - [887, 6484.85] - - [684, 8976, 1, 256] - - [790, 8128.63] + - [896, 8128.63] - - [20992, 8976, 1, 256] - - [813, 9689.75] + - [919, 9689.75] - - [2048, 684, 1, 512] - - [797, 7241.88] + - [903, 7241.88] - - [2048, 114, 1, 768] - - [823, 4872.56] + - [929, 4872.56] - - [8960, 8976, 1, 256] - - [796, 9603.45] + - [902, 9603.45] - - [2048, 1536, 1, 512] - - [799, 8830.21] + - [905, 8830.21] - - [256, 3328, 1, 1024] - - [787, 5612.65] + - [893, 5612.65] - - [33536, 8976, 1, 256] - - [813, 9797.81] + - [919, 9797.81] - - [2048, 8976, 1, 256] - - [813, 8975.56] + - [919, 8975.56] - - [10496, 8976, 1, 256] - - [804, 9654.53] + - [910, 9654.53] - - [256, 5376, 1, 1024] - - [822, 5626.44] + - [928, 5626.44] - - [256, 21248, 1, 1024] - - [791, 7525.55] + - [897, 7525.55] - - [256, 13312, 1, 1024] - - [789, 6767.21] + - [895, 6767.21] - - [16128, 8976, 1, 256] - - [813, 9715.67] + - [919, 9715.67] - - [2304, 8976, 1, 256] - - [786, 9433.93] + - [892, 9433.93] - - [256, 4864, 1, 1024] - - [776, 5743.65] + - [882, 5743.65] - - [17152, 8976, 1, 256] - - [821, 9709.04] + - [927, 9709.04] - - [15872, 8976, 1, 256] - - [821, 9657.67] + - [927, 9657.67] - - [9984, 8976, 1, 256] - - [798, 9639.84] + - [904, 9639.84] - - [256, 14592, 1, 1024] - - [810, 7224.02] + - [916, 7224.02] - - [256, 33536, 1, 1024] - - [817, 8147.41] + - [923, 8147.41] - - [11264, 8976, 1, 256] - - [813, 9510.06] + - [919, 9510.06] - - [31488, 8976, 1, 256] - - [821, 9799.41] + - [927, 9799.41] - - [256, 20480, 1, 1024] - - [794, 7498.3] + - [900, 7498.3] - - [44505, 8976, 1, 256] - - [805, 9804.88] + - [911, 9804.88] - - [13568, 8976, 1, 256] - - [813, 9680.34] + - [919, 9680.34] - - [256, 11520, 1, 1024] - - [793, 6805.36] + - [899, 6805.36] - - [256, 7936, 1, 1024] - - [809, 6971.87] + - [915, 6971.87] - - [2048, 256, 1, 768] - - [779, 7129.23] + - [885, 7129.23] - - [256, 4608, 1, 1024] - - [777, 5463.01] + - [883, 5463.01] - - [256, 2304, 1, 1024] - - [784, 4842.79] + - [890, 4842.79] - - [256, 2560, 1, 1024] - - [785, 5309.35] + - [891, 5309.35] - - [2816, 8976, 1, 256] - - [796, 9409.66] + - [902, 9409.66] + - - [1728, 320, 1, 64] + - [938, 3205.67] + - - [1152, 128, 1, 784] + - [985, 3499.06] + - - [576, 96, 1, 5329] + - [971, 3948.02] + - - [864, 96, 1, 1225] + - [992, 3009.77] + - - [256, 128, 1, 784] + - [982, 1536.59] + - - [1440, 320, 1, 196] + - [935, 4824.72] + - - [192, 48, 1, 1225] + - [1013, 820.565] + - - [2592, 384, 1, 289] + - [953, 7353.11] + - - [192, 80, 36, 10368] + - [1003, 5360.14] + - - [896, 192, 1, 289] + - [970, 3076.66] + - - [768, 128, 1, 289] + - [995, 2351.91] + - - [64, 256, 1, 3136] + - [1021, 1809.26] + - - [1280, 384, 1, 64] + - [935, 3171.2] + - - [512, 144, 1, 196] + - [993, 1445.17] + - - [1344, 192, 1, 289] + - [976, 4376.62] + - - [288, 64, 1, 21609] + - [987, 3396.22] + - - [400, 32, 1, 784] + - [1014, 922.453] + - - [288, 32, 1, 21609] + - [1025, 2816.11] + - - [1280, 448, 1, 64] + - [938, 3253.66] + - - [3456, 256, 1, 169] + - [950, 5822.54] + - - [2304, 256, 1, 196] + - [948, 4932.08] + - - [384, 192, 1, 1225] + - [996, 2720.49] + - - [832, 48, 1, 49] + - [991, 344.618] + - - [832, 192, 1, 49] + - [973, 1099.46] + - - [1280, 192, 1, 64] + - [974, 2069.66] + - - [192, 32, 1, 784] + - [1013, 459.727] + - - [288, 48, 1, 1225] + - [1020, 1176.1] + - - [512, 112, 1, 196] + - [988, 1277.31] + - - [224, 192, 36, 2592] + - [1005, 7369.66] + - - [528, 32, 1, 196] + - [979, 440.474] + - - [192, 128, 36, 1568] + - [1004, 8245.86] + - - [4032, 384, 1, 64] + - [949, 5898.34] + - - [576, 64, 1, 3136] + - [994, 2671.21] + - - [2048, 32, 1, 1001] + - [996, 2323.1] + - - [480, 64, 1, 196] + - [981, 752.74] + - - [512, 256, 1, 196] + - [983, 2528.65] + - - [864, 96, 1, 289] + - [993, 1958.5] + - - [896, 128, 1, 289] + - [996, 2725.83] + - - [192, 64, 1, 784] + - [1011, 898.775] + - - [1200, 64, 1, 1225] + - [995, 2780.24] + - - [1296, 288, 1, 196] + - [934, 3826.28] + - - [576, 96, 1, 5041] + - [975, 3795.68] + - - [1024, 256, 1, 289] + - [964, 4488.23] + - - [1024, 2048, 1, 49] + - [954, 5077.2] + - - [192, 64, 36, 6272] + - [998, 7515.08] + - - [4096, 512, 1, 4096] + - [960, 10276.1] + - - [192, 32, 1, 1225] + - [1014, 556.786] + - - [1024, 256, 1, 196] + - [974, 3892.54] + - - [1120, 192, 1, 289] + - [963, 3752.91] + - - [400, 48, 1, 196] + - [988, 480.1] + - - [1728, 224, 1, 1225] + - [941, 5575.87] + - - [800, 96, 1, 784] + - [995, 2669.04] + - - [1152, 384, 1, 64] + - [945, 3077.44] + - - [4608, 512, 1, 49] + - [952, 4676.7] + - - [1792, 256, 1, 289] + - [945, 5346.04] + - - [864, 128, 1, 784] + - [995, 3816.3] + - - [1728, 384, 1, 169] + - [947, 5191.78] + - - [480, 16, 1, 196] + - [1016, 241.331] + - - [1568, 256, 1, 289] + - [935, 4723.51] + - - [1152, 448, 1, 64] + - [941, 3356.82] + - - [512, 64, 1, 196] + - [980, 802.916] + - - [1344, 224, 1, 289] + - [935, 3519.73] + - - [9216, 512, 1, 4096] + - [958, 9146.12] + - - [27, 32, 1, 22201] + - [1026, 264.456] + - - [1152, 192, 1, 784] + - [965, 4904.18] + - - [1536, 256, 1, 64] + - [933, 2578.57] + - - [800, 128, 1, 196] + - [995, 1991.21] + - - [800, 64, 1, 196] + - [990, 1150.93] + - - [864, 208, 1, 196] + - [967, 2684.82] + - - [1440, 320, 1, 49] + - [936, 2313.54] + - - [512, 128, 1, 784] + - [986, 2780.42] + - - [720, 192, 1, 5041] + - [961, 5410.56] + - - [256, 64, 1, 784] + - [1018, 1163.6] + - - [256, 48, 1, 1225] + - [1013, 1075.3] + - - [576, 192, 1, 3136] + - [961, 4833.11] + - - [160, 64, 1, 5329] + - [1015, 1753.6] + - - [3456, 384, 1, 289] + - [955, 7341.85] + - - [32, 32, 36, 43808] + - [1009, 1378.13] + - - [1344, 512, 1, 64] + - [934, 3823.03] + - - [192, 16, 1, 784] + - [1014, 228.173] + - - [3456, 384, 1, 169] + - [951, 6675.12] + - - [1152, 256, 1, 196] + - [944, 3211.36] + - - [1728, 192, 1, 1225] + - [945, 4852.36] + - - [2048, 512, 1, 49] + - [957, 3471.74] + - - [576, 96, 1, 1225] + - [988, 2176.76] + - - [512, 2048, 1, 49] + - [939, 3845.93] + - - [1728, 192, 1, 64] + - [934, 2369.93] + - - [832, 256, 1, 49] + - [964, 1433.7] + - - [512, 128, 1, 196] + - [989, 1459.77] + - - [1200, 128, 1, 49] + - [984, 1069.19] + - - [528, 256, 1, 196] + - [972, 2069.86] + - - [256, 512, 1, 784] + - [995, 4538.99] + - - [480, 192, 1, 196] + - [995, 1792.1] + - - [96, 64, 36, 2592] + - [1002, 4845.51] + - - [96, 96, 36, 2592] + - [1007, 5111.63] + - - [1024, 192, 1, 289] + - [969, 3431.24] + - - [1536, 384, 1, 64] + - [940, 3166.94] + - - [192, 96, 1, 784] + - [980, 881.24] + - - [2048, 192, 1, 64] + - [937, 2330.27] + - - [192, 64, 1, 1225] + - [1019, 1100.45] + - - [512, 32, 1, 196] + - [1010, 477.967] + - - [128, 96, 36, 1568] + - [1006, 6649.19] + - - [528, 128, 1, 196] + - [992, 1403.33] + - - [128, 512, 1, 784] + - [982, 2237.91] + - - [128, 128, 36, 3136] + - [999, 6538.87] + - - [528, 160, 1, 196] + - [996, 1642.77] + - - [448, 64, 1, 5329] + - [971, 3264.91] + - - [1280, 320, 1, 64] + - [935, 2777.05] + - - [1792, 320, 1, 289] + - [947, 5205.0] + - - [2880, 320, 1, 64] + - [943, 4337.04] + - - [147, 64, 1, 12544] + - [1024, 2430.37] + - - [4096, 512, 1, 1001] + - [959, 9619.09] + - - [1536, 32, 1, 1001] + - [996, 1757.28] + - - [512, 160, 1, 196] + - [992, 1592.99] + - - [768, 160, 1, 289] + - [993, 2757.27] + - - [1728, 384, 1, 49] + - [945, 3102.59] + - - [64, 32, 36, 43808] + - [1000, 2626.53] + - - [64, 64, 1, 3136] + - [1012, 610.606] + - - [256, 32, 1, 784] + - [1013, 612.937] + - - [480, 96, 1, 196] + - [988, 1055.2] + - - [1024, 32, 1, 1001] + - [978, 1188.53] + - - [832, 160, 1, 49] + - [993, 959.347] + - - [512, 1024, 1, 196] + - [936, 4978.8] + - - [96, 64, 36, 10368] + - [1030, 5001.05] + - - [384, 448, 36, 512] + - [1035, 8903.1] + - - [2048, 64, 1, 1001] + - [1028, 4385.23] + - - [224, 192, 36, 5184] + - [1034, 7487.91] + - - [2048, 128, 1, 1001] + - [1027, 5764.73] + - - [96, 96, 36, 10368] + - [1036, 5275.31] + - - [192, 80, 36, 20736] + - [1032, 5409.5] + - - [96, 64, 36, 5184] + - [1030, 4911.93] + - - [1536, 64, 1, 1001] + - [1029, 3162.13] + - - [96, 64, 36, 20736] + - [1031, 5034.43] + - - [384, 448, 36, 256] + - [1033, 8815.97] + - - [96, 96, 36, 5184] + - [1037, 5236.12] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml index b440f65ce..520f17834 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -47907,6 +47907,29261 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 300 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 301 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 302 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 303 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 304 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 305 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 306 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 307 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 308 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 309 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 310 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 311 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 312 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 313 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 314 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 315 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 316 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 317 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 318 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 319 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 320 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 321 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 322 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 323 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 324 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 325 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 326 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 327 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 328 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 329 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 330 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 331 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 332 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 333 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 334 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 335 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 336 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 337 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 338 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 339 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 340 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 341 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 342 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 343 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 344 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 345 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 346 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 347 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 348 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 349 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 350 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 351 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 352 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 353 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 354 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 355 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 356 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 357 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 358 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 359 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 360 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 361 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 362 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 363 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 364 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 365 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 366 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 367 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 368 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 369 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 370 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 371 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 372 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 373 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 374 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 375 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 376 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 377 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 378 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 379 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 380 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 381 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 382 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 383 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 384 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 385 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 386 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 387 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 388 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 389 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 390 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 391 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 392 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 393 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 394 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 395 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 396 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 397 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 398 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 399 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 400 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 401 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 402 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 403 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 404 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 405 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 406 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 407 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 408 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 409 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 410 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 411 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 412 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 413 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 414 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 384 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 384 + LdsOffsetB_Blk: 1408 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 415 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 416 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 417 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 418 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 419 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 420 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 421 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 422 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 423 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 424 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 425 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 426 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 427 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 428 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 429 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 430 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 431 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 432 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 433 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 434 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 435 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 436 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 437 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 438 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 439 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 440 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 441 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 442 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 443 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 444 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 445 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 446 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 447 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 448 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 449 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 450 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 451 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 452 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 453 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 454 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 455 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 456 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 457 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 458 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 459 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 460 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 461 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 462 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 463 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 464 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 465 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 466 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 467 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 468 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 469 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 470 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 471 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 472 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 473 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 474 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 475 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 476 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 477 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 478 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 479 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 480 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 481 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -47914,7 +77169,752 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 482 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 483 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 484 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id031 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 485 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 486 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -47924,9 +77924,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -47935,46 +77935,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 8 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 2 + MacroTile0: 8 MacroTile1: 8 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -47983,13 +77983,162 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 487 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 64 PerformanceSyncLocation: -1 @@ -48035,35 +78184,333 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 300 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 + SolutionIndex: 488 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 489 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 SubGroup1: 4 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id001 + ThreadTile: *id028 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id007 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 490 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -48073,10 +78520,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -48084,47 +78531,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 LSPA: 8 LSPB: 8 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48132,8 +78579,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -48184,48 +78631,48 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 301 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 491 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id001 + ThreadTile: *id028 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id002 + WorkGroup: *id029 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -48233,26 +78680,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 16 LSCB: 16 - LSPA: 4 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 896 + LdsNumElements: 1024 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -48262,17 +78709,17 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -48281,15 +78728,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -48333,35 +78780,34 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 302 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SolutionIndex: 492 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id003 - ThreadTile0: 4 + ThreadTile: *id028 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id002 + WorkGroup: *id031 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -48371,41 +78817,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48419,10 +78864,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48430,15 +78875,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -48482,35 +78925,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 303 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionIndex: 493 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id002 + VectorWidth: 2 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -48520,9 +78963,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -48530,31 +78972,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48568,10 +79010,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48579,14 +79021,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -48631,47 +79071,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 304 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionIndex: 494 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -48679,31 +79118,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 8 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCB: 256 + LVPA: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48713,14 +79148,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48728,20 +79163,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -48780,79 +79213,74 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 305 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionIndex: 495 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: *id033 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48862,14 +79290,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48877,20 +79305,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -48929,79 +79355,74 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 306 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SolutionIndex: 496 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: *id033 WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 LdsOffsetB: 256 - LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49011,14 +79432,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 256 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49026,20 +79447,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -49078,79 +79497,74 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 307 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SolutionIndex: 497 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 32 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 64 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49163,11 +79577,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49175,20 +79589,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -49227,47 +79639,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 308 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 498 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -49275,31 +79686,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49312,11 +79723,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49324,15 +79735,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -49376,35 +79785,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 309 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionIndex: 499 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -49414,37 +79823,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCB: 64 + LSPA: 8 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 16 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -49463,9 +79871,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49473,13 +79881,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -49525,79 +79931,78 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 310 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SolutionIndex: 500 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49610,11 +80015,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49622,15 +80027,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -49674,35 +80077,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 311 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 501 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: *id033 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -49712,41 +80115,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49760,10 +80158,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49771,20 +80169,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -49822,36 +80218,36 @@ TransposeA: false TransposeB: true UseBeta: true - UseInitialStrides: false - SolutionIndex: 312 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + UseInitialStrides: false + SolutionIndex: 502 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: *id033 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -49861,41 +80257,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49909,10 +80304,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49920,15 +80315,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -49972,35 +80365,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 313 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionIndex: 503 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -50010,8 +80403,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -50028,23 +80420,19 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50058,10 +80446,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50069,20 +80457,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -50121,35 +80507,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 314 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionIndex: 504 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -50159,37 +80545,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCB: 64 + LSPA: 16 LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -50208,9 +80593,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50218,15 +80603,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -50270,35 +80653,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 315 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SolutionIndex: 505 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -50308,8 +80691,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -50326,23 +80708,19 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 + LSCB: 128 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50357,9 +80735,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50367,20 +80745,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -50419,79 +80795,76 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 316 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SolutionIndex: 506 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SubGroupB: 16 + ThreadTile: *id032 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id004 - WorkGroupMapping: 1 + WorkGroup: *id035 + WorkGroupMapping: 8 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 32 LSPA: 4 - LSPB: 16 - LVCA: 32 + LSPB: 8 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 1 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50504,11 +80877,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50516,20 +80889,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -50545,6 +80921,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -50554,6 +80931,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -50568,75 +80946,85 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 317 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 507 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 16 + LSPB: 4 LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 8 + LVCB: 16 + LVPA: 1 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -50653,11 +81041,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50665,15 +81053,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -50694,6 +81085,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -50703,6 +81095,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -50717,79 +81110,85 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 318 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 508 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id007 + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 1 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50802,11 +81201,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50814,20 +81213,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -50843,6 +81245,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -50852,6 +81255,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -50866,79 +81270,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 319 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 509 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 384 - LdsNumElementsAlignedB: 384 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 384 - LdsOffsetB_Blk: 1408 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50951,11 +81365,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50963,15 +81377,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -50992,6 +81409,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51001,6 +81419,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51015,96 +81434,102 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 320 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 510 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51112,20 +81537,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -51141,6 +81569,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51150,6 +81579,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51164,96 +81594,106 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 321 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 511 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51261,15 +81701,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -51290,6 +81733,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51299,6 +81743,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51313,96 +81758,102 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 322 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 512 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51410,20 +81861,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -51439,6 +81893,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51448,6 +81903,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51462,96 +81918,102 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 323 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 513 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51559,21 +82021,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -51588,6 +82053,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51597,6 +82063,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51611,33 +82078,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 324 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 514 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -51649,58 +82126,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51708,20 +82181,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -51737,6 +82213,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51746,6 +82223,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51760,33 +82238,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 325 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 515 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -51799,57 +82287,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -51857,15 +82345,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -51886,6 +82377,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -51895,6 +82387,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -51909,33 +82402,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 326 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 516 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -51947,8 +82450,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -51958,7 +82461,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -51966,38 +82469,34 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 16 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 1 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -52006,20 +82505,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -52035,6 +82537,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52044,6 +82547,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52058,17 +82562,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 327 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 517 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -52076,15 +82587,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -52096,8 +82610,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -52107,7 +82621,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -52116,38 +82630,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCB: 64 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52157,19 +82667,22 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -52184,6 +82697,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52193,6 +82707,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52202,22 +82717,29 @@ Tensor1: 1 TileA: 0 TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - SolutionIndex: 328 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 518 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -52225,15 +82747,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -52245,58 +82770,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52304,20 +82825,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -52333,6 +82857,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52342,6 +82867,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52356,33 +82882,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 329 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 519 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -52395,57 +82931,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52453,15 +82989,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -52482,6 +83021,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52491,6 +83031,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52505,39 +83046,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 330 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 520 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -52545,7 +83096,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -52553,27 +83104,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -52583,7 +83134,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -52591,10 +83142,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52602,26 +83153,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -52631,6 +83188,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52640,6 +83198,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52654,47 +83213,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 331 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 521 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -52702,37 +83269,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -52741,9 +83304,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52751,26 +83314,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -52780,6 +83349,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52789,6 +83359,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52803,47 +83374,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 332 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 522 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -52851,48 +83430,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -52900,26 +83475,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -52929,6 +83510,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -52938,6 +83520,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -52952,39 +83535,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 333 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 523 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -52992,45 +83583,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53038,10 +83629,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53049,26 +83640,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53078,6 +83675,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53087,6 +83685,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53101,85 +83700,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 334 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 524 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53187,10 +83790,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53198,26 +83801,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53227,6 +83836,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53236,6 +83846,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53250,85 +83861,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 335 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 525 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53336,9 +83951,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -53347,26 +83962,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53376,6 +83997,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53385,6 +84007,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53399,45 +84022,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 336 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 526 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -53447,8 +84078,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -53456,38 +84087,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -53496,26 +84123,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53525,6 +84158,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53534,6 +84168,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53548,47 +84183,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 337 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 527 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -53596,47 +84239,43 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -53645,26 +84284,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53674,6 +84319,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53683,6 +84329,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53697,39 +84344,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 338 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 528 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53737,7 +84392,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -53745,27 +84400,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -53775,7 +84430,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53783,10 +84438,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53794,26 +84449,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53823,6 +84484,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53832,6 +84494,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53846,39 +84509,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 339 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 529 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53894,8 +84565,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -53903,39 +84574,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53943,26 +84614,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53972,6 +84649,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -53981,6 +84659,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53995,85 +84674,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 340 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 530 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54081,9 +84764,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -54092,26 +84775,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54121,6 +84810,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54130,6 +84820,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54144,45 +84835,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 341 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 531 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -54192,8 +84891,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -54201,39 +84900,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54241,26 +84936,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54270,6 +84971,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54279,6 +84981,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54293,45 +84996,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 342 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 532 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -54341,8 +85052,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -54350,38 +85061,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -54390,26 +85097,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54419,6 +85132,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54428,6 +85142,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54442,47 +85157,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 343 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 533 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -54490,48 +85213,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54539,26 +85258,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54568,6 +85293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54577,6 +85303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54591,85 +85318,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 344 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 534 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54677,9 +85408,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -54688,26 +85419,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54717,6 +85454,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54726,6 +85464,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54740,95 +85479,99 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 345 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 535 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -54837,26 +85580,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54866,6 +85615,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -54875,6 +85625,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54889,33 +85640,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 346 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 536 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54927,58 +85686,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54986,26 +85741,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55015,6 +85776,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55024,6 +85786,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55038,85 +85801,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 347 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 537 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -55124,9 +85891,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -55135,26 +85902,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55164,6 +85937,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55173,6 +85947,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55187,85 +85962,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 348 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 538 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -55284,26 +86063,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55313,6 +86098,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55322,6 +86108,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55336,92 +86123,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 349 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 539 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -55433,26 +86224,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55462,6 +86259,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55471,6 +86269,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55485,85 +86284,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 350 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 540 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id012 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -55571,9 +86374,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -55585,23 +86388,29 @@ NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55611,6 +86420,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55620,6 +86430,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55634,39 +86445,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 351 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 541 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -55674,56 +86493,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55731,15 +86550,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -55751,6 +86575,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55760,6 +86585,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55769,6 +86595,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55783,39 +86610,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 352 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 542 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -55832,7 +86667,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -55840,12 +86675,12 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -55861,7 +86696,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -55869,10 +86704,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55880,15 +86715,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -55900,6 +86740,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55909,6 +86750,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -55918,6 +86760,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55932,14 +86775,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 353 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 543 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -55950,78 +86800,79 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56029,15 +86880,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56049,6 +86903,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56058,6 +86913,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56067,6 +86923,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56075,45 +86932,55 @@ Tensor0: 0 Tensor1: 1 TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - SolutionIndex: 354 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 544 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56121,56 +86988,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 128 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56178,15 +87045,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56198,6 +87070,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56207,6 +87080,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56216,6 +87090,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56230,39 +87105,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 355 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 545 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56279,7 +87162,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -56287,39 +87170,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56327,15 +87210,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56347,6 +87235,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56356,6 +87245,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56365,6 +87255,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56379,17 +87270,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 356 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 546 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -56397,28 +87295,29 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -56428,7 +87327,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -56436,16 +87335,16 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -56457,18 +87356,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56476,15 +87375,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56496,6 +87398,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56505,6 +87408,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56514,6 +87418,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56528,14 +87433,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 357 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 547 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -56546,21 +87458,24 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56568,35 +87483,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -56606,18 +87521,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56625,15 +87540,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56645,6 +87565,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56654,6 +87575,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56663,6 +87585,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56677,96 +87600,104 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 358 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 548 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56774,15 +87705,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56794,6 +87728,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56803,6 +87738,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56812,6 +87748,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56826,39 +87763,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 359 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 549 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56866,35 +87813,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -56904,18 +87851,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56923,15 +87870,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -56943,6 +87895,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56952,6 +87905,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -56961,6 +87915,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56975,39 +87930,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 360 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 550 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57024,7 +87987,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -57032,39 +87995,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57072,15 +88035,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57092,6 +88060,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57101,6 +88070,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57110,6 +88080,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57124,46 +88095,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 361 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 551 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57173,7 +88152,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -57181,39 +88160,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57221,15 +88200,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57241,6 +88223,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57250,6 +88233,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57259,6 +88243,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57273,39 +88258,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 362 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57313,56 +88308,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57370,8 +88365,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -57379,6 +88374,11 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57390,6 +88390,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57399,6 +88400,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57408,6 +88410,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57422,96 +88425,104 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 363 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 553 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57519,15 +88530,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57539,6 +88553,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57548,6 +88563,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57557,6 +88573,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57571,46 +88588,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 364 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 554 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57627,7 +88654,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 LSCB: 128 LSPA: 8 @@ -57637,13 +88664,13 @@ LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57656,7 +88683,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -57670,13 +88697,18 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57688,6 +88720,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57697,6 +88730,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57706,6 +88740,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57720,35 +88755,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 365 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 555 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57758,8 +88801,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57776,23 +88819,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57806,10 +88849,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57817,15 +88860,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -57837,6 +88883,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57846,6 +88893,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -57855,6 +88903,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -57869,35 +88918,45 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 366 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 556 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57907,8 +88966,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57925,23 +88984,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57956,9 +89016,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -57966,26 +89026,31 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57995,6 +89060,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58004,6 +89070,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58018,79 +89085,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 367 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 557 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 - LSCB: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 4 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58103,11 +89181,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -58116,18 +89194,24 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58135,6 +89219,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58144,6 +89229,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58153,6 +89239,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58167,75 +89254,84 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 368 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 558 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -58252,10 +89348,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -58264,26 +89360,33 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58293,6 +89396,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58302,6 +89406,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58316,79 +89421,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 369 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 559 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 128 - LSPA: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58401,11 +89515,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -58414,18 +89528,22 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58433,6 +89551,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58442,6 +89561,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58451,6 +89571,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58465,79 +89586,86 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 370 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 560 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58550,10 +89678,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -58562,26 +89690,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58591,6 +89727,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58600,6 +89737,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58614,33 +89752,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 371 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58652,41 +89798,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58700,10 +89843,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -58711,26 +89854,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58740,6 +89891,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58749,6 +89901,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58763,33 +89916,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 372 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58803,39 +89964,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -58849,10 +90011,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -58860,19 +90022,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58880,6 +90049,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58889,6 +90059,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -58898,6 +90069,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -58912,33 +90084,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 373 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 563 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58952,29 +90132,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -59011,17 +90192,24 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59029,6 +90217,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59038,6 +90227,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59047,6 +90237,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59061,33 +90252,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 374 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59117,23 +90316,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -59148,9 +90348,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59158,19 +90358,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59178,6 +90385,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59187,6 +90395,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59196,6 +90405,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59210,39 +90420,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 375 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59250,56 +90468,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 128 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 8 - LVPB: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59307,26 +90526,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59336,6 +90563,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59345,6 +90573,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59359,96 +90588,101 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 376 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 566 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59456,26 +90690,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59485,6 +90727,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59494,6 +90737,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59508,33 +90752,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 377 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59547,7 +90799,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -59560,44 +90812,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 + LSCA: 32 + LSCB: 8 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCA: 16 + LVCB: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59605,26 +90858,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59634,6 +90893,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59643,6 +90903,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59657,75 +90918,86 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 378 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 32 LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 6400 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -59735,18 +91007,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -59754,19 +91026,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59774,6 +91051,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59783,6 +91061,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59792,6 +91071,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59806,79 +91086,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 379 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 96 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -59891,10 +91182,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -59903,19 +91194,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59923,6 +91219,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59932,6 +91229,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -59941,6 +91239,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -59955,79 +91254,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 380 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCA: 96 + LSCB: 64 + LSPA: 5 LSPB: 8 - LVCA: 16 + LVCA: 48 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60040,11 +91350,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60052,19 +91362,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60072,6 +91387,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60081,6 +91397,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60090,6 +91407,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60104,46 +91422,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 381 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -60160,23 +91488,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60189,11 +91518,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60201,19 +91530,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60221,6 +91555,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60230,6 +91565,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60239,6 +91575,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60253,39 +91590,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 382 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -60293,39 +91640,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60338,11 +91686,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60353,23 +91701,31 @@ NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60379,6 +91735,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60388,6 +91745,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60402,39 +91760,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 383 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -60458,23 +91824,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60487,11 +91854,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60499,19 +91866,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60519,6 +91893,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60528,6 +91903,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60537,6 +91913,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60551,79 +91928,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 384 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60636,11 +92022,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60648,19 +92034,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60668,6 +92059,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60677,6 +92069,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60686,6 +92079,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60700,79 +92094,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 385 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60785,11 +92190,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60797,19 +92202,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60817,6 +92229,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60826,6 +92239,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60835,6 +92249,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60849,48 +92264,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 386 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id027 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -60902,26 +92325,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -60934,11 +92358,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -60954,11 +92378,18 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60966,6 +92397,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60975,6 +92407,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -60984,6 +92417,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -60998,33 +92432,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 387 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id027 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61038,7 +92480,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -61050,44 +92492,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 64 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61095,19 +92538,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61115,6 +92565,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61124,6 +92575,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61133,6 +92585,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61147,39 +92600,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 388 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61195,7 +92656,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -61203,23 +92664,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3328 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -61232,11 +92694,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61247,16 +92709,23 @@ NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61264,6 +92733,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61273,6 +92743,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61282,6 +92753,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61296,14 +92768,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 389 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 @@ -61314,21 +92793,22 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id031 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61344,31 +92824,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -61381,11 +92862,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61393,19 +92874,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61413,6 +92901,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61422,6 +92911,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61431,6 +92921,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61445,39 +92936,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 390 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 4 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61493,48 +92992,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 + LSCB: 32 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61542,19 +93042,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61562,6 +93069,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61571,6 +93079,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61580,6 +93089,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61594,17 +93104,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 391 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 @@ -61612,21 +93129,22 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id030 - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61642,31 +93160,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -61679,10 +93198,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 8 - MacroTileA: 8 + MacroTileA: 32 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -61693,17 +93212,24 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61711,6 +93237,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61720,6 +93247,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61729,6 +93257,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61743,14 +93272,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 392 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 @@ -61760,22 +93296,23 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -61783,7 +93320,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -61791,37 +93328,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61829,9 +93367,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -61841,25 +93379,33 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61869,6 +93415,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -61878,6 +93425,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -61892,33 +93440,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 393 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id030 + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61940,48 +93496,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61989,19 +93546,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62009,6 +93573,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62018,6 +93583,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62027,6 +93593,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -62041,33 +93608,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 394 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62081,8 +93656,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -62090,47 +93665,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62138,19 +93714,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62158,6 +93741,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62167,6 +93751,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62176,6 +93761,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -62190,33 +93776,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 395 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62229,9 +93823,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -62239,47 +93833,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62287,26 +93882,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62316,6 +93917,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62325,6 +93927,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -62339,33 +93942,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 396 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62379,65 +93992,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -62445,17 +94061,25 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62465,6 +94089,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62474,8 +94099,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -62488,77 +94115,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 397 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id031 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -62571,29 +94209,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62601,6 +94250,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62610,6 +94260,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62619,8 +94270,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -62633,78 +94286,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 398 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -62717,7 +94380,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -62725,21 +94388,30 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62747,6 +94419,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62756,6 +94429,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62765,8 +94439,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -62779,74 +94455,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 399 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -62856,39 +94548,49 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62898,6 +94600,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -62907,8 +94610,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -62921,74 +94626,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 400 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 + LSCA: 64 + LSCB: 96 LSPA: 8 - LSPB: 1 + LSPB: 5 LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -62998,39 +94719,49 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63040,6 +94771,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63049,8 +94781,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63063,74 +94797,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 401 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 + LSCA: 64 + LSCB: 96 LSPA: 8 - LSPB: 1 + LSPB: 5 LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63140,39 +94890,49 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63182,6 +94942,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63191,8 +94952,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63205,74 +94968,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 402 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63285,36 +95064,46 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63324,6 +95113,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63333,8 +95123,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63347,78 +95139,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 403 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63431,29 +95235,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63461,6 +95276,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63470,6 +95286,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63479,8 +95296,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63493,33 +95312,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 404 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -63531,40 +95358,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63579,27 +95408,36 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63607,6 +95445,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63616,6 +95455,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63625,8 +95465,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63639,13 +95481,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 405 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -63655,17 +95505,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -63677,40 +95529,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 - LVPA: 8 + LVCB: 64 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63724,28 +95578,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63753,6 +95616,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63762,6 +95626,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63771,8 +95636,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63785,74 +95652,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 406 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63865,36 +95748,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63904,6 +95799,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -63913,8 +95809,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -63927,12 +95825,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 407 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -63943,58 +95849,60 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -64011,29 +95919,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64041,6 +95960,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64050,6 +95970,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -64059,8 +95980,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64073,74 +95996,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 408 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -64153,36 +96090,46 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64192,6 +96139,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -64201,8 +96149,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64215,70 +96165,82 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 409 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -64299,29 +96261,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64329,6 +96302,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64338,6 +96312,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -64347,8 +96322,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64361,74 +96338,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 410 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -64441,36 +96432,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64480,6 +96483,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -64489,8 +96493,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64503,32 +96509,39 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 411 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id035 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -64536,14 +96549,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64560,50 +96573,60 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -64613,13 +96636,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64642,6 +96667,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64658,20 +96684,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 412 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -64679,16 +96705,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -64696,14 +96720,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64720,54 +96744,60 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -64777,6 +96807,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64784,6 +96815,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64806,6 +96838,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64822,37 +96855,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 413 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + SolutionIndex: 603 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -64860,13 +96891,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -64884,50 +96915,58 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -64937,13 +96976,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64966,6 +97007,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -64982,20 +97024,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 414 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 604 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -65003,16 +97045,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65020,7 +97062,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -65044,54 +97086,58 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65101,6 +97147,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -65108,6 +97155,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65130,6 +97178,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -65146,37 +97195,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 415 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + SolutionIndex: 605 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65184,14 +97233,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65208,25 +97257,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 1 - LVPB: 4 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -65234,24 +97288,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65261,13 +97320,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65290,6 +97351,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -65306,8 +97368,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 416 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -65316,10 +97378,10 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -65327,16 +97389,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65344,7 +97404,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -65368,29 +97428,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -65399,23 +97460,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65425,6 +97489,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -65432,6 +97497,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65454,6 +97520,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -65470,16 +97537,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 417 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -65491,10 +97558,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -65514,53 +97581,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -65569,13 +97641,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65585,13 +97660,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65630,8 +97707,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 418 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 608 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -65640,27 +97717,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65668,60 +97743,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65729,13 +97809,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65745,13 +97828,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65790,37 +97875,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 419 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 609 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -65834,37 +97917,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -65878,10 +97966,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65889,13 +97977,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -65905,13 +97996,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65950,8 +98043,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 420 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 610 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -65960,23 +98053,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -65988,47 +98079,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66041,11 +98133,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66053,13 +98145,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66069,6 +98164,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66076,6 +98172,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66114,8 +98211,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 421 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 611 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -66123,28 +98220,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66158,53 +98253,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 4 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -66213,13 +98313,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66229,13 +98332,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66274,37 +98379,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 422 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 612 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66312,60 +98415,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66373,13 +98481,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66389,13 +98498,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66434,15 +98545,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 423 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 613 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -66454,17 +98565,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66478,53 +98589,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -66533,13 +98649,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66549,13 +98666,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66594,14 +98713,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 424 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 614 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -66614,9 +98733,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -66624,7 +98743,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66639,36 +98758,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -66678,18 +98798,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66697,13 +98817,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66713,6 +98836,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66720,6 +98844,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66758,37 +98883,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 425 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 615 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -66796,7 +98919,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -66804,39 +98927,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66849,7 +98973,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -66862,14 +98986,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -66879,8 +99004,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -66925,8 +99051,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 426 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 616 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -66934,22 +99060,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -66961,13 +99087,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -66985,6 +99111,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 @@ -66995,9 +99122,13 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67010,7 +99141,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -67024,13 +99155,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67040,7 +99172,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -67086,8 +99219,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 427 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -67110,11 +99243,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67122,13 +99255,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -67142,10 +99275,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 @@ -67156,15 +99290,19 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67184,14 +99322,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67201,7 +99340,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -67247,35 +99387,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 428 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 618 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67283,41 +99423,42 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -67329,7 +99470,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67337,10 +99478,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67348,15 +99489,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67366,8 +99506,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -67412,35 +99553,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 429 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + SolutionIndex: 619 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67448,49 +99591,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67498,10 +99646,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67509,15 +99657,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67527,8 +99676,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -67573,35 +99723,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 430 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 620 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67609,43 +99759,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67658,11 +99813,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67671,14 +99826,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67688,7 +99842,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -67734,15 +99889,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 431 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 621 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -67750,19 +99905,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67770,13 +99927,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -67790,29 +99947,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67820,10 +99982,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67831,15 +99993,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -67849,8 +100012,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -67895,35 +100059,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 432 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 622 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -67931,13 +100095,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -67951,23 +100115,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 4 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67980,11 +100149,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67992,15 +100161,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68010,7 +100180,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -68056,35 +100227,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 433 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68099,30 +100270,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -68158,14 +100330,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68175,8 +100346,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -68221,8 +100393,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 434 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 624 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -68230,26 +100402,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68257,7 +100431,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68265,45 +100439,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -68311,10 +100486,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68322,15 +100497,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68340,8 +100516,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -68386,35 +100563,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 435 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + SolutionIndex: 625 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68428,37 +100605,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68473,9 +100655,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68484,14 +100666,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68501,8 +100684,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -68547,15 +100731,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 436 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 626 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -68563,19 +100747,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68583,43 +100767,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68632,11 +100821,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68645,14 +100834,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68662,7 +100850,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -68708,15 +100897,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 437 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 627 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -68724,19 +100913,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68744,13 +100935,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -68764,23 +100955,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 4 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68793,11 +100989,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68805,15 +101001,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68823,7 +101020,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -68869,35 +101067,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 438 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 628 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -68905,49 +101103,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -68956,9 +101159,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68966,15 +101169,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -68984,8 +101188,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -69030,15 +101235,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 439 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 629 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -69046,19 +101251,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69072,54 +101277,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69128,14 +101338,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69145,8 +101356,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -69191,8 +101403,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 440 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 630 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -69201,25 +101413,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69233,37 +101445,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 LVCA: 64 - LVCB: 16 + LVCB: 64 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69277,10 +101494,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69288,15 +101505,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69306,7 +101524,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -69352,35 +101571,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 441 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 631 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69388,13 +101607,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -69404,43 +101623,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -69449,15 +101673,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69467,7 +101692,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -69513,14 +101739,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 442 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 632 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -69533,15 +101759,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69549,13 +101775,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -69573,25 +101799,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 2 - LSPB: 4 + LSPA: 4 + LSPB: 8 LVCA: 64 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -69610,15 +101841,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69628,8 +101860,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -69674,8 +101907,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 443 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 633 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -69695,14 +101928,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69716,37 +101949,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69761,9 +101999,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69772,14 +102010,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69789,7 +102028,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -69835,15 +102075,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 444 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 634 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -69851,19 +102091,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -69871,60 +102111,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69932,15 +102177,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -69950,8 +102194,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -69996,15 +102241,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 445 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -70012,19 +102257,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70038,7 +102285,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -70056,19 +102303,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 + LSCB: 64 + LSPA: 4 LSPB: 4 LVCA: 64 - LVCB: 32 - LVPA: 2 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70083,9 +102335,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70095,13 +102347,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70111,7 +102364,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -70157,15 +102411,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 446 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 636 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -70178,14 +102432,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70193,14 +102447,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -70217,40 +102471,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70258,15 +102513,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70276,6 +102530,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70322,20 +102577,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 447 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 637 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -70343,14 +102598,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70358,7 +102615,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70366,56 +102623,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70423,15 +102681,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70441,6 +102700,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70487,31 +102747,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 448 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 638 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -70523,43 +102783,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCA: 32 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -70569,18 +102830,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70588,13 +102849,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70604,6 +102868,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70650,37 +102915,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 449 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 639 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70688,7 +102951,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70696,56 +102959,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70753,8 +103017,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -70762,6 +103026,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70771,6 +103036,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70817,35 +103083,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 450 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -70861,56 +103127,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70918,15 +103185,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -70936,6 +103204,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -70982,35 +103251,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 451 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 641 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71025,36 +103294,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -71073,9 +103343,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71083,13 +103353,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71099,6 +103372,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71145,8 +103419,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 452 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 642 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -71154,28 +103428,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71183,7 +103455,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -71191,39 +103463,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -71236,11 +103509,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71248,8 +103521,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -71257,6 +103530,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71266,6 +103540,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71312,8 +103587,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 453 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 643 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -71321,14 +103596,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -71336,11 +103611,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71348,7 +103623,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -71356,56 +103631,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71413,13 +103689,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71429,6 +103706,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71475,37 +103753,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 454 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 644 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71513,64 +103791,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71578,15 +103857,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71596,6 +103874,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71642,35 +103921,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 455 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 645 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71686,56 +103967,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71743,15 +104025,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71761,6 +104044,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71807,35 +104091,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 456 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 646 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -71850,57 +104134,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 + LSCA: 32 + LSCB: 64 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71908,13 +104193,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -71924,6 +104212,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -71970,37 +104259,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 457 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 647 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -72016,39 +104303,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72062,9 +104350,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72073,15 +104361,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72091,6 +104380,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -72137,8 +104427,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 458 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 648 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72146,18 +104436,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -72165,7 +104455,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -72173,47 +104463,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72226,11 +104517,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72238,13 +104529,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72254,6 +104548,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -72300,8 +104595,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 459 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72309,28 +104604,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -72338,7 +104631,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -72346,56 +104639,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72403,15 +104697,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72421,6 +104716,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -72467,35 +104763,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 460 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -72503,64 +104799,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72568,13 +104865,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72584,6 +104884,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -72630,33 +104931,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 461 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -72668,40 +104967,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -72722,11 +105021,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72734,13 +105033,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72752,7 +105054,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -72797,8 +105099,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 462 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72807,23 +105109,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -72843,40 +105143,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72890,9 +105190,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72901,15 +105201,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72966,8 +105267,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 463 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72976,17 +105277,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -72994,7 +105295,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -73022,28 +105323,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73057,10 +105358,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73068,15 +105369,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -73088,7 +105390,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -73133,8 +105435,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 464 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73143,19 +105445,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -73169,44 +105471,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -73216,7 +105518,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73224,10 +105526,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73235,13 +105537,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -73298,37 +105603,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 465 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -73336,13 +105639,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -73356,30 +105659,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73387,10 +105694,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73398,14 +105705,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -73418,7 +105725,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -73464,31 +105771,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 466 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -73506,7 +105813,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -73520,24 +105827,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 4 - LSPB: 4 + LSPB: 8 LVCA: 64 - LVCB: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73552,9 +105863,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73562,14 +105873,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -73582,7 +105893,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -73628,8 +105939,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 467 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73638,17 +105949,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -73664,7 +105975,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73684,45 +105995,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 16 LSPA: 4 - LSPB: 4 + LSPB: 16 LVCA: 64 - LVCB: 64 + LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73730,13 +106041,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -73796,31 +106107,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 468 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -73832,48 +106143,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73886,7 +106197,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -73899,13 +106210,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -73964,8 +106273,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 469 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73980,15 +106289,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -74007,37 +106318,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -74047,18 +106358,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74066,13 +106377,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -74132,15 +106441,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 470 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -74152,15 +106461,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -74195,17 +106506,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 16 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -74224,9 +106535,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74234,14 +106545,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -74255,7 +106566,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -74300,28 +106611,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 471 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -74336,13 +106647,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -74356,30 +106667,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -74387,10 +106702,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74398,14 +106713,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -74418,7 +106733,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -74464,35 +106779,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 472 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -74500,48 +106815,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 8 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74554,11 +106869,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74566,12 +106881,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -74585,7 +106902,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -74630,33 +106947,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 473 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -74668,54 +106983,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 32 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 4 + LVCB: 32 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -74724,9 +107039,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74734,11 +107049,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -74798,15 +107115,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 474 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -74819,12 +107136,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -74843,41 +107158,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 64 LSCB: 64 - LSPA: 5 - LSPB: 8 - LVCA: 48 - LVCB: 32 - LVPA: 3 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74891,9 +107206,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -74902,12 +107217,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -74966,8 +107283,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 475 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74976,13 +107293,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -74991,8 +107308,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75020,7 +107335,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -75030,22 +107345,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 64 LSCB: 64 - LSPA: 5 + LSPA: 8 LSPB: 8 - LVCA: 48 + LVCA: 32 LVCB: 32 - LVPA: 3 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -75059,9 +107374,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75070,11 +107385,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -75134,8 +107449,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 476 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75144,10 +107459,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -75159,7 +107474,7 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75179,7 +107494,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -75188,7 +107503,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -75199,21 +107514,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 64 LSPA: 8 - LSPB: 5 + LSPB: 8 LVCA: 32 - LVCB: 48 + LVCB: 32 LVPA: 4 - LVPB: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -75228,9 +107543,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75238,12 +107553,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -75302,8 +107619,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 477 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75312,11 +107629,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -75327,8 +107644,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75340,7 +107655,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75348,57 +107663,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 6656 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75406,13 +107721,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -75427,7 +107742,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -75472,31 +107787,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 478 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75508,7 +107823,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75534,14 +107849,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 512 @@ -75555,7 +107870,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -75563,10 +107878,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75574,8 +107889,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -75640,31 +107955,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 479 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75683,37 +107998,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -75732,9 +108047,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75742,11 +108057,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -75806,8 +108123,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 480 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75816,23 +108133,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75844,7 +108159,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75860,7 +108175,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -75870,28 +108185,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -75899,9 +108214,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75910,14 +108225,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -75976,14 +108291,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 481 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -75997,10 +108312,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76020,36 +108335,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 16 + LSPA: 4 LSPB: 16 - LVCA: 16 + LVCA: 64 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -76059,18 +108374,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76078,13 +108393,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -76144,29 +108459,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 482 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -76180,48 +108495,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -76234,7 +108549,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -76247,12 +108562,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -76312,8 +108625,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 483 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76332,11 +108645,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76356,56 +108671,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76414,14 +108729,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -76480,35 +108795,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 484 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -76523,41 +108838,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -76571,9 +108886,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76582,14 +108897,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -76648,31 +108961,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 485 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76684,48 +108999,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -76738,11 +109053,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76752,12 +109067,10 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -76816,15 +109129,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 486 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -76832,19 +109145,21 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -76860,57 +109175,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76918,13 +109233,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -76984,28 +109299,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 487 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -77020,7 +109335,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -77028,36 +109343,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77074,11 +109389,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77087,13 +109402,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -77107,7 +109422,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -77152,15 +109467,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 488 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -77172,11 +109487,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -77188,7 +109503,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -77214,18 +109529,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77235,7 +109550,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -77243,10 +109558,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77254,14 +109569,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -77320,20 +109635,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 489 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -77341,14 +109656,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -77364,40 +109679,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 - LVPA: 2 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77411,9 +109726,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77422,14 +109737,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -77488,8 +109803,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 490 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77498,19 +109813,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -77524,14 +109839,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -77550,18 +109865,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77578,10 +109893,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77590,12 +109905,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -77609,7 +109926,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -77654,8 +109971,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 491 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77664,10 +109981,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -77676,11 +109993,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -77700,36 +110015,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77739,34 +110054,32 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -77781,7 +110094,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -77810,7 +110123,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -77827,28 +110139,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 492 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -77871,36 +110183,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -77910,34 +110222,32 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -77981,7 +110291,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -77998,15 +110307,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 493 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -78018,9 +110327,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -78034,48 +110343,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78088,7 +110397,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -78096,18 +110405,18 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78150,7 +110459,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78167,8 +110475,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 494 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78183,17 +110491,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78205,40 +110511,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -78252,33 +110558,33 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78321,7 +110627,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78338,15 +110643,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 495 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -78358,13 +110663,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78376,7 +110679,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78403,27 +110706,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 32 LSPA: 8 - LSPB: 5 + LSPB: 16 LVCA: 32 - LVCB: 48 + LVCB: 16 LVPA: 4 - LVPB: 3 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -78432,24 +110735,22 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78463,7 +110764,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -78492,7 +110793,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78509,33 +110809,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 496 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78547,48 +110847,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 - LSPA: 8 - LSPB: 5 - LVCA: 32 - LVCB: 48 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78601,26 +110901,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78663,7 +110963,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78680,8 +110979,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 497 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78690,23 +110989,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78718,7 +111015,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78726,46 +111023,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 96 - LSPA: 8 - LSPB: 5 - LVCA: 32 - LVCB: 48 - LVPA: 4 - LVPB: 3 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -78773,25 +111070,23 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -78834,7 +111129,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -78851,33 +111145,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 498 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -78889,14 +111183,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -78915,56 +111209,52 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79007,7 +111297,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79024,31 +111313,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 499 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79060,76 +111351,76 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -79176,7 +111467,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79193,33 +111483,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 500 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79231,14 +111519,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -79257,28 +111545,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -79286,25 +111574,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79347,7 +111635,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79364,15 +111651,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 501 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 @@ -79385,12 +111672,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79402,7 +111687,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79428,28 +111713,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -79457,27 +111742,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79520,7 +111803,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79537,15 +111819,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 502 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -79558,10 +111840,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79573,7 +111855,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79599,56 +111881,54 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79691,7 +111971,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79708,14 +111987,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 503 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -79729,10 +112008,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79744,14 +112023,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -79770,28 +112049,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -79799,25 +112078,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -79860,7 +112139,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -79877,15 +112155,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 504 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -79898,12 +112176,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -79941,22 +112217,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -79970,10 +112246,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79983,14 +112259,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -80004,7 +112280,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -80040,6 +112316,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80050,8 +112327,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 505 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80060,11 +112337,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -80072,7 +112349,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -80092,7 +112369,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -80106,28 +112383,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -80141,10 +112414,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80154,14 +112427,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -80174,8 +112447,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -80211,6 +112484,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80221,8 +112495,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 506 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80231,19 +112505,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -80257,7 +112531,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80265,56 +112539,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80325,13 +112599,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -80382,6 +112656,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80392,14 +112667,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 507 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -80408,15 +112683,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -80428,7 +112703,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80436,56 +112711,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80496,13 +112771,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -80553,6 +112828,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80563,14 +112839,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 508 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -80579,15 +112855,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -80599,54 +112875,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -80654,9 +112926,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80667,12 +112939,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -80685,7 +112959,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -80722,6 +112996,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80732,14 +113007,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 509 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 699 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -80748,17 +113023,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -80770,54 +113043,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -80825,9 +113094,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80838,12 +113107,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -80856,7 +113127,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -80893,6 +113164,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -80903,14 +113175,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 510 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -80919,17 +113191,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -80941,7 +113211,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80949,57 +113219,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81009,14 +113279,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81030,7 +113300,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -81066,6 +113336,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81076,15 +113347,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 511 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -81092,15 +113363,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81112,65 +113383,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81180,12 +113447,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81198,8 +113467,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -81235,6 +113504,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81245,15 +113515,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 512 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -81261,21 +113531,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81283,7 +113551,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81303,34 +113571,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -81339,23 +113607,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -81370,7 +113640,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -81399,12 +113669,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81415,15 +113687,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 513 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -81435,11 +113707,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81451,54 +113723,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -81506,25 +113774,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81537,7 +113807,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81567,12 +113837,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81583,31 +113855,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 514 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81619,7 +113891,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81645,22 +113917,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -81673,26 +113945,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81735,12 +114009,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81751,8 +114027,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 515 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81761,11 +114037,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -81773,13 +114049,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81787,13 +114063,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -81807,28 +114083,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -81841,26 +114113,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -81873,7 +114147,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81903,12 +114177,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -81919,8 +114195,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 516 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81929,21 +114205,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81975,60 +114251,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82071,12 +114349,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82087,28 +114367,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 517 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -82123,78 +114403,78 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82207,7 +114487,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -82237,12 +114517,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82253,15 +114535,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 518 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 708 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -82269,17 +114551,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -82291,14 +114571,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -82317,28 +114597,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 8 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -82346,23 +114626,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82405,12 +114689,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82421,14 +114707,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 519 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -82442,12 +114728,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -82459,7 +114743,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82467,46 +114751,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 + LSCB: 64 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -82515,24 +114799,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82575,12 +114861,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82591,15 +114879,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 520 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -82607,15 +114895,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -82627,7 +114915,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82635,8 +114923,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82647,7 +114935,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -82655,20 +114943,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82681,7 +114969,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -82689,12 +114977,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -82743,12 +115033,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82759,8 +115051,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 521 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -82779,15 +115071,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -82795,7 +115087,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82803,40 +115095,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82849,7 +115141,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -82857,18 +115149,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -82911,12 +115205,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -82927,8 +115223,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 522 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -82943,15 +115239,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -82971,36 +115267,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -83010,33 +115306,35 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -83079,12 +115377,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83095,15 +115395,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 523 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -83111,13 +115411,13 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -83138,71 +115438,75 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -83245,12 +115549,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83261,14 +115567,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 524 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -83281,13 +115587,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -83299,14 +115603,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -83325,28 +115629,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -83354,24 +115658,24 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -83415,12 +115719,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83431,31 +115737,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 525 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -83529,7 +115837,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -83581,12 +115891,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83597,8 +115909,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 526 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -83635,7 +115947,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83662,27 +115974,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 + LVCB: 64 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -83690,25 +116002,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -83751,12 +116065,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83767,20 +116083,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 527 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -83788,10 +116104,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -83803,7 +116119,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83823,28 +116139,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83857,26 +116173,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -83919,12 +116237,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -83935,8 +116255,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 528 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -83945,21 +116265,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -83987,7 +116307,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -83997,22 +116317,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 96 LSCB: 64 - LSPA: 8 + LSPA: 5 LSPB: 8 - LVCA: 32 + LVCA: 48 LVCB: 32 - LVPA: 4 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84026,22 +116346,24 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -84085,12 +116407,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84101,8 +116425,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 529 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84111,10 +116435,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -84126,7 +116450,7 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -84139,54 +116463,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -84194,24 +116518,24 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -84255,12 +116579,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84271,31 +116597,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 530 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -84315,40 +116643,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3328 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84363,23 +116691,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -84423,12 +116753,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84439,8 +116771,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 531 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84449,19 +116781,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -84475,7 +116807,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84483,40 +116815,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84529,24 +116861,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -84589,12 +116923,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84605,8 +116941,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 532 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84615,23 +116951,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -84643,48 +116979,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84697,26 +117033,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -84759,12 +117095,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84775,8 +117113,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 533 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84785,21 +117123,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -84811,7 +117151,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84819,72 +117159,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -84898,7 +117240,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -84927,12 +117269,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -84943,31 +117287,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 534 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -84979,7 +117323,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84987,46 +117331,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -85035,24 +117379,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -85066,7 +117412,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -85095,12 +117441,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85111,31 +117459,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 535 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85147,7 +117495,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -85173,22 +117521,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 128 - LSPA: 4 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 - LVPA: 2 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85201,26 +117549,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -85234,7 +117584,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -85263,12 +117613,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85279,8 +117631,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 536 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85289,10 +117641,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -85301,9 +117653,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85323,72 +117675,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -85402,7 +117756,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -85431,12 +117785,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85447,28 +117803,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 537 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -85483,7 +117839,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -85491,46 +117847,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -85539,23 +117895,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -85599,12 +117957,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85615,31 +117975,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 538 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85667,7 +118027,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -85678,21 +118038,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 + LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85707,24 +118067,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -85767,12 +118129,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85783,8 +118147,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 539 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85793,11 +118157,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -85819,14 +118183,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -85845,28 +118209,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -85874,19 +118234,23 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -85903,8 +118267,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -85933,12 +118297,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -85949,33 +118315,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 540 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85994,41 +118358,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -86043,23 +118407,23 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -86103,12 +118467,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86119,8 +118485,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 541 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86129,21 +118495,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86155,7 +118523,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86163,70 +118531,72 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -86269,12 +118639,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86285,31 +118657,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 542 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -86323,7 +118695,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86343,34 +118715,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -86378,25 +118750,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -86410,7 +118784,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -86439,12 +118813,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86455,35 +118831,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 543 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -86491,7 +118867,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86511,20 +118887,20 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -86538,7 +118914,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -86546,19 +118922,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -86578,7 +118956,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -86607,12 +118985,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86623,31 +119003,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 544 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86659,7 +119039,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86667,46 +119047,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -86714,24 +119094,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -86775,12 +119157,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86791,31 +119175,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 545 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86827,7 +119211,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86853,53 +119237,55 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -86914,7 +119300,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -86943,12 +119329,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -86959,20 +119347,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 546 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -86980,10 +119368,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86995,48 +119383,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -87049,26 +119437,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87111,12 +119499,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87127,8 +119517,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 547 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87137,21 +119527,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87163,13 +119555,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -87183,28 +119575,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -87217,26 +119605,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87249,7 +119639,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -87279,12 +119669,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87295,8 +119687,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 548 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87305,21 +119697,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87331,78 +119723,78 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87415,7 +119807,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -87445,12 +119837,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87461,33 +119855,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 549 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87499,15 +119891,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -87515,38 +119907,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -87554,23 +119946,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87613,12 +120009,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87629,15 +120027,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 550 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -87650,12 +120048,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87687,7 +120083,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -87729,12 +120125,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -87783,12 +120181,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87799,8 +120199,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 551 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87819,9 +120219,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -87843,30 +120243,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 8 LVPB: 4 LdcEqualsLdd: false @@ -87897,18 +120297,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -87951,12 +120353,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -87967,8 +120371,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87983,11 +120387,11 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -88003,7 +120407,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88011,72 +120415,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88119,12 +120525,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88135,31 +120543,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88171,54 +120579,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88226,25 +120634,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88287,12 +120695,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88303,31 +120713,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88339,7 +120751,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88375,12 +120787,12 @@ LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88393,7 +120805,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -88401,18 +120813,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88455,12 +120869,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88471,8 +120887,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88495,7 +120911,7 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88527,28 +120943,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 8 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 8 - LVCA: 32 + LVCA: 8 LVCB: 32 - LVPA: 8 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88562,24 +120978,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -88623,12 +121041,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88639,29 +121059,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -88675,54 +121095,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88730,25 +121150,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88791,12 +121211,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88807,31 +121229,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88843,7 +121267,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88852,71 +121276,73 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -88959,12 +121385,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -88975,35 +121403,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89011,7 +121439,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89031,34 +121459,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -89066,25 +121494,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -89127,12 +121557,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89143,31 +121575,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 559 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89179,7 +121611,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89187,40 +121619,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89233,19 +121665,21 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -89295,12 +121729,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89311,35 +121747,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 560 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89347,7 +121783,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89355,7 +121791,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89363,32 +121799,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89401,26 +121837,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -89463,12 +121901,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89479,31 +121919,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 561 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89515,7 +121955,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89523,7 +121963,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89531,63 +121971,65 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -89631,12 +122073,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89647,31 +122091,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 562 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89690,31 +122134,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 4 LVPB: 16 LdcEqualsLdd: false @@ -89730,14 +122174,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -89745,18 +122189,18 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -89799,12 +122243,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89815,31 +122261,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 563 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89857,42 +122305,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89907,23 +122351,27 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -89935,7 +122383,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89965,12 +122413,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -89981,15 +122431,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 564 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -89997,17 +122447,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90019,54 +122467,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -90081,17 +122525,21 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90103,7 +122551,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90133,12 +122581,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90149,8 +122599,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 565 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90165,21 +122615,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90187,81 +122635,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90273,7 +122719,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90303,12 +122749,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90319,31 +122767,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 566 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90381,22 +122829,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 64 + LSCB: 64 LSPA: 2 LSPB: 2 - LVCA: 128 - LVCB: 128 + LVCA: 64 + LVCB: 64 LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90410,26 +122858,28 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90442,7 +122892,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -90471,12 +122921,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90487,20 +122939,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 567 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -90508,14 +122960,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90523,7 +122975,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90543,34 +122995,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -90578,26 +123030,28 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90610,7 +123064,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -90639,12 +123093,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90655,31 +123111,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 568 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90699,7 +123155,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -90707,28 +123163,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -90747,25 +123203,27 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90778,7 +123236,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -90807,12 +123265,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90823,29 +123283,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 569 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -90867,32 +123327,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -90921,19 +123381,21 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -90946,7 +123408,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -90975,12 +123437,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -90991,28 +123455,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 570 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -91034,9 +123498,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91047,7 +123511,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -91057,10 +123521,10 @@ LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -91089,17 +123553,21 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91112,7 +123580,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -91141,12 +123609,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91157,33 +123627,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 571 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91202,9 +123670,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91215,7 +123683,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -91225,10 +123693,10 @@ LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -91257,19 +123725,19 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91282,7 +123750,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -91311,12 +123779,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91327,31 +123797,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 572 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91363,81 +123835,81 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 64 - LSPA: 32 + LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 16 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91479,12 +123951,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91495,31 +123969,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 573 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91531,81 +124007,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91617,7 +124091,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -91647,12 +124121,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91663,31 +124139,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 574 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91699,48 +124175,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 32 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 32 + LSPB: 4 + LVCA: 16 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91753,27 +124225,29 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91785,7 +124259,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -91815,12 +124289,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91831,31 +124307,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 575 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91867,14 +124343,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -91895,26 +124371,26 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 8 + LSPA: 8 + LSPB: 4 LVCA: 16 LVCB: 32 - LVPA: 8 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91929,19 +124405,19 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -91983,12 +124459,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -91999,8 +124477,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 576 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92020,10 +124498,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92041,75 +124521,71 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 16 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92121,7 +124597,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -92151,12 +124627,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92167,31 +124645,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 577 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92203,79 +124683,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false + DepthU: 8 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 8 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92287,7 +124767,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -92317,12 +124797,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92333,33 +124815,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 578 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92371,7 +124851,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -92379,73 +124859,75 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 16 - LVCA: 32 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92458,7 +124940,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -92487,12 +124969,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92503,31 +124987,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 579 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92539,14 +125023,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92565,28 +125049,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -92594,24 +125078,28 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92624,7 +125112,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -92653,12 +125141,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92669,20 +125159,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 580 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -92690,12 +125180,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92707,7 +125195,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -92715,36 +125203,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -92754,32 +125242,34 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92792,7 +125282,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -92821,12 +125311,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -92837,31 +125329,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 581 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92875,81 +125367,81 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -92991,12 +125483,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -93007,31 +125501,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 582 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93043,54 +125539,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 32 + LVPA: 2 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93098,26 +125594,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -93130,7 +125626,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93159,12 +125655,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -93175,31 +125673,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 583 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93211,7 +125711,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93237,28 +125737,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93266,25 +125766,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93298,7 +125800,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93327,31 +125829,35 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 584 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -93364,14 +125870,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -93399,7 +125905,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -93441,12 +125947,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -93466,7 +125974,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93495,24 +126003,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93531,9 +126043,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -93553,42 +126065,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -93609,18 +126117,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93633,7 +126143,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -93663,24 +126173,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93695,11 +126209,11 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -93723,7 +126237,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -93731,28 +126245,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -93762,33 +126276,35 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93831,44 +126347,48 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -93883,7 +126403,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93891,46 +126411,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93939,24 +126459,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93999,47 +126521,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94071,28 +126597,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 4 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94107,24 +126633,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94138,7 +126666,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94167,24 +126695,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94193,19 +126725,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -94219,7 +126751,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94227,46 +126759,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94274,25 +126806,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94306,7 +126840,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94335,47 +126869,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94387,7 +126925,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94414,27 +126952,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 4 LVCA: 32 - LVCB: 16 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94443,22 +126981,24 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94472,7 +127012,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94501,47 +127041,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94555,48 +127099,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94609,26 +127153,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94671,24 +127215,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94697,21 +127245,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94723,7 +127273,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94731,46 +127281,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94778,23 +127328,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94837,47 +127389,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94891,54 +127447,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94946,22 +127498,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -94975,7 +127531,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95005,49 +127561,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95059,7 +127617,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95085,55 +127643,57 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 8 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95146,7 +127706,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95175,47 +127735,51 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95227,7 +127791,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95235,46 +127799,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95282,26 +127846,28 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95314,7 +127880,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95343,30 +127909,34 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] @@ -95379,11 +127949,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95395,81 +127965,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95481,7 +128049,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95511,30 +128079,34 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -95543,15 +128115,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95563,7 +128135,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95571,19 +128143,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -95593,31 +128165,31 @@ LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -95625,19 +128197,21 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95650,7 +128224,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95679,24 +128253,28 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -95715,11 +128293,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95731,54 +128309,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 16 - LVCA: 32 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95787,25 +128365,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -95818,7 +128396,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95847,31 +128425,35 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -95883,11 +128465,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [4, 1028.02] @@ -97961,8 +130545,6 @@ - [100, 9061.26] - - [49, 2048, 128, 512] - [98, 6963.26] - - - [784, 512, 64, 128] - - [100, 8822.52] - - [784, 128, 128, 512] - [107, 8983.53] - - [196, 256, 64, 1024] @@ -97973,36 +130555,22 @@ - [99, 8581.25] - - [49, 2048, 256, 512] - [98, 7049.54] - - - [196, 1024, 64, 256] - - [101, 7953.59] - - [784, 128, 256, 512] - [109, 9102.89] - - [196, 256, 128, 1024] - [101, 8085.79] - - - [3136, 64, 64, 256] - - [105, 9266.03] - - - [784, 128, 64, 512] - - [106, 8809.29] - - - [49, 2048, 64, 512] - - [98, 6843.85] - - [3136, 64, 128, 256] - [105, 9381.29] - - [3136, 256, 128, 64] - [103, 8982.54] - - [784, 512, 128, 128] - [100, 8965.89] - - - [3136, 256, 64, 64] - - [103, 8879.7] - - [3136, 64, 256, 256] - [105, 9566.33] - - - [3136, 64, 64, 64] - - [104, 8313.95] - - [3136, 64, 256, 64] - [99, 8743.7] - - [196, 1024, 128, 256] - [102, 8119.33] - - - [49, 512, 64, 2048] - - [110, 7055.31] - - [49, 512, 256, 2048] - [111, 7166.31] - - [196, 1024, 256, 256] @@ -100875,4990 +133443,5482 @@ - [280, 4689.35] - - [64, 64, 36, 1760] - [230, 5622.24] + - - [196, 528, 32, 32] + - [313, 4088.41] + - - [5329, 64, 32, 80] + - [306, 8331.14] + - - [64, 2880, 1, 320] + - [357, 4362.6] + - - [49, 832, 32, 256] + - [320, 5618.63] + - - [3136, 64, 64, 64] + - [306, 8457.65] + - - [196, 512, 32, 24] + - [307, 3621.73] + - - [289, 1120, 1, 160] + - [303, 3302.86] + - - [1225, 192, 32, 32] + - [311, 6194.57] + - - [64, 2048, 32, 384] + - [334, 9541.54] + - - [1001, 1536, 1, 32] + - [305, 3575.67] + - - [289, 1792, 1, 320] + - [328, 5140.33] + - - [3136, 256, 64, 64] + - [329, 9310.12] + - - [1001, 1024, 1, 32] + - [300, 2733.4] + - - [196, 480, 32, 64] + - [361, 5070.42] + - - [64, 1728, 1, 320] + - [358, 3205.57] + - - [49, 832, 32, 160] + - [362, 4988.82] + - - [49, 2048, 64, 512] + - [332, 7370.31] + - - [49, 832, 32, 384] + - [320, 5901.95] + - - [289, 896, 1, 192] + - [346, 3452.59] + - - [289, 1024, 32, 384] + - [365, 8902.42] + - - [784, 192, 32, 96] + - [376, 7853.63] + - - [50176, 256, 1, 128] + - [339, 9041.83] + - - [289, 1024, 32, 256] + - [374, 8660.72] + - - [289, 1024, 32, 192] + - [363, 8433.35] + - - [12544, 512, 1, 256] + - [323, 9187.34] + - - [1225, 1728, 1, 192] + - [327, 7720.85] + - - [196, 480, 32, 96] + - [372, 5662.5] + - - [196, 512, 32, 144] + - [366, 6531.38] + - - [784, 400, 1, 32] + - [301, 1280.0] + - - [289, 768, 32, 128] + - [367, 7913.61] + - - [5329, 576, 1, 96] + - [310, 7563.46] + - - [49, 1200, 1, 128] + - [354, 1011.61] + - - [64, 1536, 32, 256] + - [368, 9159.54] + - - [289, 2592, 1, 384] + - [336, 6002.71] + - - [196, 528, 32, 128] + - [371, 5987.1] + - - [64, 2048, 32, 448] + - [334, 9669.87] + - - [196, 1024, 64, 256] + - [373, 7818.94] + - - [5329, 448, 1, 64] + - [306, 6201.02] + - - [784, 256, 32, 64] + - [308, 7623.18] + - - [784, 192, 32, 32] + - [313, 5874.26] + - - [21609, 288, 1, 32] + - [326, 5296.5] + - - [784, 256, 32, 32] + - [304, 6235.46] + - - [5041, 720, 1, 192] + - [322, 8140.98] + - - [289, 2016, 1, 256] + - [319, 5404.05] + - - [196, 512, 32, 128] + - [364, 6366.82] + - - [289, 768, 32, 160] + - [366, 8253.88] + - - [64, 1536, 32, 384] + - [337, 9508.5] + - - [64, 1280, 32, 320] + - [337, 9070.73] + - - [289, 896, 1, 128] + - [347, 2917.68] + - - [289, 3456, 1, 384] + - [327, 7274.91] + - - [196, 800, 1, 64] + - [349, 1393.78] + - - [64, 1280, 32, 384] + - [333, 9225.01] + - - [64, 1344, 1, 512] + - [352, 3041.45] + - - [1001, 4096, 1, 512] + - [333, 9391.77] + - - [1225, 192, 32, 64] + - [306, 7729.29] + - - [64, 1152, 1, 384] + - [356, 2440.65] + - - [729, 1600, 1, 192] + - [318, 6827.71] + - - [289, 1344, 1, 192] + - [316, 4439.04] + - - [784, 192, 32, 16] + - [343, 3663.04] + - - [3136, 1024, 1, 2048] + - [325, 9071.77] + - - [64, 1152, 1, 448] + - [353, 2564.45] + - - [49, 832, 32, 128] + - [316, 4733.16] + - - [784, 256, 32, 128] + - [329, 8471.6] + - - [49, 800, 1, 128] + - [351, 633.535] + - - [196, 512, 32, 32] + - [313, 4354.26] + - - [1225, 384, 32, 96] + - [330, 8751.63] + - - [5041, 576, 1, 96] + - [312, 7067.63] + - - [49, 832, 32, 48] + - [345, 3316.72] + - - [3136, 64, 64, 256] + - [367, 9721.9] + - - [5329, 160, 32, 64] + - [369, 8159.84] + - - [1225, 288, 32, 48] + - [359, 6673.65] + - - [4096, 9216, 1, 512] + - [341, 10116.9] + - - [196, 480, 32, 192] + - [370, 6388.46] + - - [64, 1152, 1, 256] + - [357, 1982.6] + - - [3136, 1024, 1, 512] + - [325, 8745.57] + - - [49, 832, 32, 32] + - [344, 2717.87] + - - [784, 192, 32, 64] + - [308, 7216.32] + - - [289, 1024, 32, 128] + - [331, 7970.5] + - - [289, 768, 32, 192] + - [375, 8327.27] + - - [289, 1120, 1, 192] + - [315, 3716.9] + - - [196, 512, 32, 112] + - [321, 6252.81] + - - [1001, 2048, 1, 32] + - [309, 4000.09] + - - [1225, 288, 32, 64] + - [369, 7208.04] + - - [196, 600, 1, 64] + - [348, 1093.95] + - - [1225, 384, 32, 192] + - [330, 9332.66] + - - [50176, 256, 1, 512] + - [340, 9833.54] + - - [196, 512, 32, 160] + - [367, 6614.34] + - - [4096, 4096, 1, 512] + - [338, 10032.2] + - - [49, 832, 32, 192] + - [316, 5244.53] + - - [1225, 256, 32, 64] + - [306, 7972.35] + - - [64, 2048, 32, 320] + - [334, 9404.27] + - - [196, 480, 32, 16] + - [360, 2724.49] + - - [1225, 256, 32, 48] + - [308, 7100.38] + - - [64, 1280, 32, 448] + - [333, 9344.41] + - - [1225, 1200, 1, 64] + - [302, 5157.89] + - - [1225, 384, 32, 64] + - [306, 8219.96] + - - [12544, 512, 1, 1024] + - [325, 9672.72] + - - [64, 1280, 32, 192] + - [321, 8525.01] + - - [196, 512, 32, 64] + - [306, 5489.34] + - - [289, 1792, 1, 256] + - [324, 4831.61] + - - [196, 528, 32, 256] + - [342, 6453.82] + - - [49, 512, 64, 2048] + - [377, 7548.98] + - - [64, 2048, 32, 192] + - [329, 8955.81] + - - [784, 512, 64, 128] + - [329, 9160.73] + - - [784, 128, 64, 512] + - [336, 9280.69] + - - [196, 528, 32, 160] + - [370, 6161.15] + - - [1225, 192, 32, 48] + - [306, 7236.92] + - - [64, 1728, 1, 192] + - [356, 2480.57] + - - [1001, 2048, 1, 64] + - [382, 5714.42] + - - [5329, 64, 128, 80] + - [389, 8835.29] + - - [64, 1280, 128, 448] + - [387, 10020.5] + - - [289, 768, 128, 128] + - [390, 8542.71] + - - [1225, 192, 128, 64] + - [379, 8444.77] + - - [1225, 288, 128, 48] + - [392, 7244.66] + - - [289, 768, 128, 192] + - [394, 8794.49] + - - [289, 768, 128, 160] + - [391, 8705.33] + - - [64, 2048, 128, 192] + - [385, 9780.26] + - - [64, 1280, 128, 384] + - [388, 9950.9] + - - [1225, 256, 128, 48] + - [380, 8273.61] + - - [1225, 192, 128, 48] + - [380, 8140.32] + - - [1225, 288, 128, 64] + - [392, 7886.21] + - - [64, 1280, 128, 320] + - [384, 9894.56] + - - [1225, 256, 128, 64] + - [385, 8572.51] + - - [1001, 2048, 1, 128] + - [386, 7289.06] + - - [1225, 192, 128, 32] + - [381, 7104.57] + - - [64, 1280, 128, 192] + - [393, 9642.08] + - - [1001, 1536, 1, 64] + - [383, 5146.56] - - [1024, 128, 1, 128] - - [304, 1028.12] + - [399, 1028.12] - - [4, 704, 1, 1280] - - [343, 363.455] + - [438, 363.455] - - [4, 1856, 1, 3328] - - [343, 579.534] + - [438, 579.534] - - [1856, 448, 1, 3328] - - [380, 6966.83] + - [475, 6966.83] - - [2944, 4288, 1, 1280] - - [375, 9057.98] + - [470, 9057.98] - - [2368, 64, 1, 3328] - - [336, 5837.66] + - [431, 5837.66] - - [2368, 5888, 1, 256] - - [380, 9111.16] + - [475, 9111.16] - - [128, 64, 1, 256] - - [342, 374.591] + - [437, 374.591] - - [5888, 1024, 1, 1280] - - [385, 8570.54] + - [480, 8570.54] - - [128, 6784, 1, 3328] - - [348, 7703.96] + - [443, 7703.96] - - [64, 4, 1, 256] - - [394, 11.3219] + - [489, 11.3219] - - [5888, 1856, 1, 3328] - - [380, 9394.4] + - [475, 9394.4] - - [5056, 704, 1, 256] - - [383, 8026.99] + - [478, 8026.99] - - [5888, 2944, 1, 3328] - - [373, 7608.21] + - [468, 7608.21] - - [1856, 4288, 1, 256] - - [374, 8986.42] + - [469, 8986.42] - - [1024, 5056, 1, 128] - - [366, 3898.34] + - [461, 3898.34] - - [5056, 5056, 1, 3328] - - [374, 9536.85] + - [469, 9536.85] - - [1408, 5888, 1, 1280] - - [375, 9279.19] + - [470, 9279.19] - - [2368, 448, 1, 128] - - [367, 2474.42] + - [462, 2474.42] - - [1024, 3584, 1, 3328] - - [377, 9258.58] + - [472, 9258.58] - - [4, 2944, 1, 1280] - - [329, 611.84] + - [424, 611.84] - - [1408, 64, 1, 128] - - [300, 858.31] + - [395, 858.31] - - [256, 4288, 1, 3328] - - [380, 7616.08] + - [475, 7616.08] - - [5888, 1408, 1, 1280] - - [373, 9620.39] + - [468, 9620.39] - - [704, 1856, 1, 3328] - - [374, 9033.75] + - [469, 9033.75] - - [4, 1408, 1, 128] - - [387, 24.455] + - [482, 24.455] - - [1024, 2368, 1, 256] - - [374, 7526.25] + - [469, 7526.25] - - [1408, 1856, 1, 1280] - - [377, 8324.19] + - [472, 8324.19] - - [1408, 64, 1, 1280] - - [348, 4681.24] + - [443, 4681.24] - - [448, 1024, 1, 1280] - - [374, 7112.53] + - [469, 7112.53] - - [256, 1408, 1, 3328] - - [380, 5825.51] + - [475, 5825.51] - - [5056, 5056, 1, 1280] - - [383, 9233.65] + - [478, 9233.65] - - [448, 5056, 1, 256] - - [375, 7003.27] + - [470, 7003.27] - - [704, 1856, 1, 1280] - - [374, 8877.38] + - [469, 8877.38] - - [128, 5056, 1, 128] - - [366, 2301.14] + - [461, 2301.14] - - [2368, 128, 1, 256] - - [374, 3849.04] + - [469, 3849.04] - - [1856, 1408, 1, 128] - - [369, 4202.31] + - [464, 4202.31] - - [64, 5056, 1, 256] - - [375, 3109.62] + - [470, 3109.62] - - [6784, 256, 1, 3328] - - [374, 6388.53] + - [469, 6388.53] - - [6784, 4288, 1, 3328] - - [385, 9114.67] + - [480, 9114.67] - - [4288, 448, 1, 256] - - [378, 5783.05] + - [473, 5783.05] - - [64, 704, 1, 128] - - [311, 379.519] + - [406, 379.519] - - [1856, 2368, 1, 3328] - - [374, 9128.46] + - [469, 9128.46] - - [4288, 2944, 1, 1280] - - [380, 9182.33] + - [475, 9182.33] - - [704, 5056, 1, 1280] - - [374, 9071.57] + - [469, 9071.57] - - [2368, 704, 1, 3328] - - [380, 7731.43] + - [475, 7731.43] - - [256, 5888, 1, 256] - - [374, 7920.38] + - [469, 7920.38] - - [1856, 4288, 1, 3328] - - [380, 9330.07] + - [475, 9330.07] - - [256, 2944, 1, 256] - - [381, 5312.27] + - [476, 5312.27] - - [5888, 1024, 1, 256] - - [372, 6710.97] + - [467, 6710.97] - - [448, 64, 1, 1280] - - [347, 2814.53] + - [442, 2814.53] - - [448, 5056, 1, 3328] - - [374, 8255.53] + - [469, 8255.53] - - [3584, 4, 1, 1280] - - [323, 640.815] + - [418, 640.815] - - [2944, 64, 1, 256] - - [322, 2621.54] + - [417, 2621.54] - - [128, 4, 1, 1280] - - [394, 86.3316] + - [489, 86.3316] - - [1408, 2944, 1, 256] - - [374, 8848.99] + - [469, 8848.99] - - [256, 1856, 1, 1280] - - [374, 7366.55] + - [469, 7366.55] - - [6784, 5056, 1, 3328] - - [385, 8332.16] + - [480, 8332.16] - - [5056, 5056, 1, 256] - - [380, 9171.74] + - [475, 9171.74] - - [1408, 6784, 1, 128] - - [366, 5079.19] + - [461, 5079.19] - - [64, 1024, 1, 1280] - - [338, 3679.31] + - [433, 3679.31] - - [2944, 4, 1, 256] - - [329, 369.543] + - [424, 369.543] - - [704, 5056, 1, 128] - - [366, 4509.27] + - [461, 4509.27] - - [4, 2368, 1, 1280] - - [323, 569.844] + - [418, 569.844] - - [2368, 2944, 1, 1280] - - [385, 7451.14] + - [480, 7451.14] - - [128, 3584, 1, 1280] - - [383, 6071.26] + - [478, 6071.26] - - [6784, 6784, 1, 1280] - - [380, 9535.74] + - [475, 9535.74] - - [1408, 4288, 1, 1280] - - [383, 8255.09] + - [478, 8255.09] - - [3584, 4288, 1, 1280] - - [385, 9651.19] + - [480, 9651.19] - - [2368, 704, 1, 1280] - - [380, 8291.4] + - [475, 8291.4] - - [5056, 4288, 1, 3328] - - [372, 9406.36] + - [467, 9406.36] - - [3584, 2368, 1, 3328] - - [380, 9350.32] + - [475, 9350.32] - - [64, 704, 1, 1280] - - [347, 3384.59] + - [442, 3384.59] - - [4288, 256, 1, 256] - - [380, 5593.62] + - [475, 5593.62] - - [2944, 128, 1, 128] - - [302, 2130.6] + - [397, 2130.6] - - [6784, 448, 1, 1280] - - [383, 8815.85] + - [478, 8815.85] - - [1408, 2944, 1, 128] - - [366, 4558.34] + - [461, 4558.34] - - [4288, 2944, 1, 256] - - [385, 7865.43] + - [480, 7865.43] - - [5888, 704, 1, 1280] - - [374, 9262.99] + - [469, 9262.99] - - [1856, 64, 1, 1280] - - [348, 4359.15] + - [443, 4359.15] - - [448, 5888, 1, 128] - - [369, 4000.59] + - [464, 4000.59] - - [5888, 64, 1, 3328] - - [349, 6603.39] + - [444, 6603.39] - - [2944, 256, 1, 3328] - - [374, 8423.63] + - [469, 8423.63] - - [1024, 64, 1, 128] - - [319, 582.642] + - [414, 582.642] - - [5056, 2368, 1, 1280] - - [374, 9419.91] + - [469, 9419.91] - - [448, 3584, 1, 1280] - - [374, 7985.82] + - [469, 7985.82] - - [6784, 5888, 1, 256] - - [372, 9494.36] + - [467, 9494.36] - - [704, 1024, 1, 128] - - [366, 2813.35] + - [461, 2813.35] - - [704, 128, 1, 1280] - - [348, 4477.71] + - [443, 4477.71] - - [5888, 2944, 1, 128] - - [369, 4745.96] + - [464, 4745.96] - - [4, 3584, 1, 128] - - [386, 96.479] + - [481, 96.479] - - [1408, 448, 1, 1280] - - [374, 6912.8] + - [469, 6912.8] - - [1024, 1408, 1, 256] - - [382, 5810.85] + - [477, 5810.85] - - [2368, 2368, 1, 3328] - - [383, 9088.71] + - [478, 9088.71] - - [1856, 6784, 1, 128] - - [369, 5168.32] + - [464, 5168.32] - - [5056, 704, 1, 3328] - - [375, 7464.9] + - [470, 7464.9] - - [1408, 1856, 1, 256] - - [380, 6727.69] + - [475, 6727.69] - - [1408, 704, 1, 3328] - - [380, 8379.53] + - [475, 8379.53] - - [2368, 5056, 1, 256] - - [380, 8664.11] + - [475, 8664.11] - - [5888, 1856, 1, 256] - - [385, 5810.02] + - [480, 5810.02] - - [4288, 64, 1, 3328] - - [362, 6583.94] + - [457, 6583.94] - - [2368, 4, 1, 1280] - - [395, 545.251] + - [490, 545.251] - - [704, 5888, 1, 256] - - [380, 8813.71] + - [475, 8813.71] - - [4288, 64, 1, 256] - - [338, 3059.97] + - [433, 3059.97] - - [6784, 64, 1, 256] - - [380, 3490.96] + - [475, 3490.96] - - [2944, 256, 1, 256] - - [374, 6970.4] + - [469, 6970.4] - - [2944, 6784, 1, 3328] - - [374, 9475.79] + - [469, 9475.79] - - [704, 1408, 1, 3328] - - [374, 8154.18] + - [469, 8154.18] - - [3584, 704, 1, 3328] - - [374, 8995.07] + - [469, 8995.07] - - [2944, 256, 1, 128] - - [366, 2824.13] + - [461, 2824.13] - - [6784, 4, 1, 1280] - - [323, 625.714] + - [418, 625.714] - - [1024, 64, 1, 1280] - - [335, 3307.91] + - [430, 3307.91] - - [448, 4288, 1, 256] - - [380, 6074.48] + - [475, 6074.48] - - [64, 3584, 1, 3328] - - [328, 6200.26] + - [423, 6200.26] - - [704, 2368, 1, 1280] - - [374, 8291.4] + - [469, 8291.4] - - [448, 2944, 1, 128] - - [366, 3221.87] + - [461, 3221.87] - - [1856, 2368, 1, 1280] - - [385, 6855.24] + - [480, 6855.24] - - [2368, 128, 1, 3328] - - [336, 6479.61] + - [431, 6479.61] - - [2944, 128, 1, 256] - - [374, 3828.23] + - [469, 3828.23] - - [448, 1408, 1, 256] - - [375, 4525.9] + - [470, 4525.9] - - [1856, 4288, 1, 1280] - - [373, 9160.32] + - [468, 9160.32] - - [64, 5056, 1, 3328] - - [356, 6819.3] + - [451, 6819.3] - - [4, 704, 1, 256] - - [340, 123.541] + - [435, 123.541] - - [1024, 448, 1, 128] - - [369, 1989.27] + - [464, 1989.27] - - [704, 4, 1, 1280] - - [343, 381.931] + - [438, 381.931] - - [704, 256, 1, 128] - - [366, 1109.17] + - [461, 1109.17] - - [704, 2944, 1, 128] - - [366, 4089.03] + - [461, 4089.03] - - [1408, 1024, 1, 1280] - - [380, 8192.08] + - [475, 8192.08] - - [704, 6784, 1, 256] - - [374, 6717.9] + - [469, 6717.9] - - [6784, 704, 1, 256] - - [380, 5429.22] + - [475, 5429.22] - - [5056, 1408, 1, 128] - - [366, 4954.5] + - [461, 4954.5] - - [256, 3584, 1, 3328] - - [374, 7890.96] + - [469, 7890.96] - - [4, 5888, 1, 3328] - - [391, 691.047] + - [486, 691.047] - - [128, 1408, 1, 128] - - [313, 1393.14] + - [408, 1393.14] - - [3584, 4288, 1, 3328] - - [376, 8900.87] + - [471, 8900.87] - - [5888, 1856, 1, 1280] - - [377, 9345.85] + - [472, 9345.85] - - [5056, 1024, 1, 3328] - - [378, 7834.84] + - [473, 7834.84] - - [5056, 64, 1, 1280] - - [356, 5890.14] + - [451, 5890.14] - - [1024, 704, 1, 256] - - [374, 6007.57] + - [469, 6007.57] - - [1024, 4288, 1, 128] - - [368, 3497.09] + - [463, 3497.09] - - [4288, 64, 1, 1280] - - [353, 4726.59] + - [448, 4726.59] - - [2368, 3584, 1, 1280] - - [372, 8128.82] + - [467, 8128.82] - - [2368, 6784, 1, 1280] - - [372, 9478.72] + - [467, 9478.72] - - [1024, 256, 1, 256] - - [380, 4092.1] + - [475, 4092.1] - - [1856, 4, 1, 1280] - - [395, 509.903] + - [490, 509.903] - - [448, 448, 1, 256] - - [380, 3001.28] + - [475, 3001.28] - - [2944, 3584, 1, 3328] - - [381, 9081.91] + - [476, 9081.91] - - [128, 4288, 1, 128] - - [301, 2323.33] + - [396, 2323.33] - - [64, 448, 1, 256] - - [344, 1066.97] + - [439, 1066.97] - - [128, 1024, 1, 3328] - - [357, 6392.36] + - [452, 6392.36] - - [4, 1408, 1, 3328] - - [340, 616.656] + - [435, 616.656] - - [6784, 2944, 1, 256] - - [383, 8547.73] + - [478, 8547.73] - - [64, 1856, 1, 1280] - - [356, 4409.71] + - [451, 4409.71] - - [64, 1024, 1, 128] - - [300, 554.902] + - [395, 554.902] - - [4288, 2368, 1, 3328] - - [376, 8780.08] + - [471, 8780.08] - - [1856, 2368, 1, 256] - - [383, 4976.74] + - [478, 4976.74] - - [3584, 256, 1, 128] - - [368, 2812.37] + - [463, 2812.37] - - [3584, 6784, 1, 3328] - - [378, 9278.22] + - [473, 9278.22] - - [256, 1024, 1, 256] - - [374, 4346.53] + - [469, 4346.53] - - [4, 6784, 1, 3328] - - [393, 681.366] + - [488, 681.366] - - [1024, 5888, 1, 3328] - - [374, 9187.61] + - [469, 9187.61] - - [1024, 128, 1, 1280] - - [326, 3660.05] + - [421, 3660.05] - - [4288, 128, 1, 1280] - - [380, 6019.17] + - [475, 6019.17] - - [5056, 4288, 1, 1280] - - [372, 9343.96] + - [467, 9343.96] - - [5888, 64, 1, 256] - - [374, 4692.17] + - [469, 4692.17] - - [1856, 256, 1, 1280] - - [380, 4790.38] + - [475, 4790.38] - - [64, 5888, 1, 3328] - - [348, 6702.2] + - [443, 6702.2] - - [2944, 5888, 1, 128] - - [369, 5202.65] + - [464, 5202.65] - - [704, 5888, 1, 1280] - - [374, 9264.29] + - [469, 9264.29] - - [2368, 3584, 1, 128] - - [366, 5053.71] + - [461, 5053.71] - - [6784, 5888, 1, 3328] - - [372, 7926.8] + - [467, 7926.8] - - [704, 1024, 1, 1280] - - [373, 5402.6] + - [468, 5402.6] - - [448, 256, 1, 3328] - - [356, 6124.65] + - [451, 6124.65] - - [448, 1856, 1, 128] - - [367, 2885.96] + - [462, 2885.96] - - [128, 1024, 1, 128] - - [301, 1013.22] + - [396, 1013.22] - - [2944, 4, 1, 128] - - [386, 77.6374] + - [481, 77.6374] - - [1024, 704, 1, 1280] - - [374, 7365.58] + - [469, 7365.58] - - [128, 5888, 1, 256] - - [374, 6990.61] + - [469, 6990.61] - - [1024, 5056, 1, 1280] - - [379, 9422.0] + - [474, 9422.0] - - [4288, 1024, 1, 256] - - [381, 6270.03] + - [476, 6270.03] - - [2944, 2368, 1, 128] - - [366, 4918.18] + - [461, 4918.18] - - [704, 704, 1, 3328] - - [374, 7963.65] + - [469, 7963.65] - - [704, 1408, 1, 1280] - - [374, 8347.32] + - [469, 8347.32] - - [5888, 448, 1, 1280] - - [380, 5217.05] + - [475, 5217.05] - - [3584, 256, 1, 3328] - - [374, 7802.25] + - [469, 7802.25] - - [704, 5888, 1, 3328] - - [380, 8381.46] + - [475, 8381.46] - - [704, 1856, 1, 128] - - [366, 3598.38] + - [461, 3598.38] - - [128, 3584, 1, 3328] - - [336, 7161.11] + - [431, 7161.11] - - [6784, 2368, 1, 1280] - - [385, 9464.41] + - [480, 9464.41] - - [4, 4288, 1, 128] - - [386, 132.68] + - [481, 132.68] - - [128, 704, 1, 1280] - - [348, 4463.85] + - [443, 4463.85] - - [3584, 2944, 1, 256] - - [385, 8201.24] + - [480, 8201.24] - - [1856, 128, 1, 3328] - - [327, 6575.5] + - [422, 6575.5] - - [4, 64, 1, 1280] - - [343, 43.6745] + - [438, 43.6745] - - [4, 5056, 1, 3328] - - [323, 675.315] + - [418, 675.315] - - [128, 2944, 1, 1280] - - [327, 5916.99] + - [422, 5916.99] - - [2368, 1024, 1, 3328] - - [380, 8646.84] + - [475, 8646.84] - - [128, 256, 1, 3328] - - [361, 4130.85] + - [456, 4130.85] - - [1408, 5056, 1, 3328] - - [379, 9529.75] + - [474, 9529.75] - - [1856, 1856, 1, 3328] - - [378, 8114.99] + - [473, 8114.99] - - [3584, 128, 1, 256] - - [374, 5603.18] + - [469, 5603.18] - - [448, 1408, 1, 3328] - - [374, 7073.03] + - [469, 7073.03] - - [2368, 2368, 1, 256] - - [381, 7648.76] + - [476, 7648.76] - - [4288, 4288, 1, 1280] - - [376, 9244.11] + - [471, 9244.11] - - [64, 448, 1, 1280] - - [347, 2885.33] + - [442, 2885.33] - - [1408, 4288, 1, 256] - - [374, 8080.41] + - [469, 8080.41] - - [448, 4, 1, 256] - - [392, 84.4294] + - [487, 84.4294] - - [5888, 448, 1, 128] - - [369, 3540.8] + - [464, 3540.8] - - [448, 4, 1, 1280] - - [343, 322.257] + - [438, 322.257] - - [704, 6784, 1, 3328] - - [373, 8613.58] + - [468, 8613.58] - - [5888, 5888, 1, 1280] - - [380, 9502.05] + - [475, 9502.05] - - [5056, 1024, 1, 1280] - - [383, 9110.11] + - [478, 9110.11] - - [448, 5888, 1, 3328] - - [374, 8586.43] + - [469, 8586.43] - - [128, 4, 1, 128] - - [386, 4.27959] + - [481, 4.27959] - - [1024, 2944, 1, 1280] - - [382, 7096.53] + - [477, 7096.53] - - [5056, 5888, 1, 1280] - - [373, 9693.51] + - [468, 9693.51] - - [4288, 5888, 1, 128] - - [366, 5406.46] + - [461, 5406.46] - - [256, 3584, 1, 256] - - [374, 6908.37] + - [469, 6908.37] - - [1408, 3584, 1, 128] - - [366, 4645.69] + - [461, 4645.69] - - [256, 2944, 1, 3328] - - [377, 6284.4] + - [472, 6284.4] - - [448, 3584, 1, 128] - - [369, 3675.37] + - [464, 3675.37] - - [5888, 2944, 1, 1280] - - [379, 9628.9] + - [474, 9628.9] - - [4, 6784, 1, 1280] - - [323, 688.176] + - [418, 688.176] - - [2368, 5888, 1, 128] - - [366, 5273.96] + - [461, 5273.96] - - [64, 2944, 1, 128] - - [310, 1316.54] + - [405, 1316.54] - - [3584, 5888, 1, 256] - - [380, 9239.14] + - [475, 9239.14] - - [2368, 704, 1, 128] - - [369, 3537.65] + - [464, 3537.65] - - [3584, 2944, 1, 1280] - - [374, 9324.62] + - [469, 9324.62] - - [3584, 2368, 1, 128] - - [366, 4766.34] + - [461, 4766.34] - - [5056, 704, 1, 128] - - [366, 4487.95] + - [461, 4487.95] - - [448, 2368, 1, 128] - - [369, 2877.02] + - [464, 2877.02] - - [5056, 1408, 1, 3328] - - [385, 9515.97] + - [480, 9515.97] - - [1408, 704, 1, 256] - - [377, 6836.18] + - [472, 6836.18] - - [6784, 1024, 1, 3328] - - [372, 9309.65] + - [467, 9309.65] - - [6784, 2944, 1, 3328] - - [373, 9536.58] + - [468, 9536.58] - - [2944, 5056, 1, 3328] - - [374, 9526.25] + - [469, 9526.25] - - [1856, 1856, 1, 256] - - [374, 5239.24] + - [469, 5239.24] - - [1024, 5888, 1, 128] - - [366, 4006.28] + - [461, 4006.28] - - [2048, 7133, 1, 2048] - - [372, 9828.07] + - [467, 9828.07] - - [256, 4, 1, 128] - - [387, 4.38908] + - [482, 4.38908] - - [4288, 5888, 1, 1280] - - [382, 9202.83] + - [477, 9202.83] - - [4288, 4288, 1, 256] - - [377, 5521.18] + - [472, 5521.18] - - [448, 2944, 1, 3328] - - [380, 7724.53] + - [475, 7724.53] - - [4288, 1856, 1, 1280] - - [380, 8826.34] + - [475, 8826.34] - - [1856, 2944, 1, 3328] - - [374, 9194.9] + - [469, 9194.9] - - [256, 6784, 1, 3328] - - [374, 8740.33] + - [469, 8740.33] - - [64, 5888, 1, 256] - - [374, 4766.35] + - [469, 4766.35] - - [256, 5056, 1, 128] - - [366, 2937.6] + - [461, 2937.6] - - [5056, 1024, 1, 256] - - [385, 5467.91] + - [480, 5467.91] - - [704, 64, 1, 3328] - - [362, 4818.43] + - [457, 4818.43] - - [5056, 1856, 1, 3328] - - [379, 8861.69] + - [474, 8861.69] - - [4, 2944, 1, 3328] - - [329, 662.102] + - [424, 662.102] - - [4, 5056, 1, 256] - - [389, 494.121] + - [484, 494.121] - - [1856, 1408, 1, 256] - - [374, 8674.78] + - [469, 8674.78] - - [3584, 4, 1, 128] - - [386, 108.296] + - [481, 108.296] - - [448, 448, 1, 3328] - - [348, 6457.4] + - [443, 6457.4] - - [6784, 128, 1, 3328] - - [341, 7256.71] + - [436, 7256.71] - - [4288, 1408, 1, 128] - - [369, 4791.76] + - [464, 4791.76] - - [4288, 5056, 1, 256] - - [374, 8560.84] + - [469, 8560.84] - - [1408, 128, 1, 1280] - - [356, 5085.79] + - [451, 5085.79] - - [5056, 256, 1, 3328] - - [377, 7284.23] + - [472, 7284.23] - - [704, 704, 1, 256] - - [374, 6171.19] + - [469, 6171.19] - - [1024, 5888, 1, 1280] - - [379, 8852.89] + - [474, 8852.89] - - [6784, 2368, 1, 128] - - [367, 4729.3] + - [462, 4729.3] - - [4, 5056, 1, 1280] - - [340, 670.046] + - [435, 670.046] - - [64, 128, 1, 256] - - [342, 369.317] + - [437, 369.317] - - [128, 1856, 1, 1280] - - [336, 5549.13] + - [431, 5549.13] - - [5056, 3584, 1, 256] - - [380, 7115.84] + - [475, 7115.84] - - [1856, 1024, 1, 1280] - - [372, 8196.5] + - [467, 8196.5] - - [6784, 4288, 1, 1280] - - [373, 9509.66] + - [468, 9509.66] - - [1856, 1856, 1, 1280] - - [375, 5791.99] + - [470, 5791.99] - - [6784, 2944, 1, 128] - - [366, 5317.12] + - [461, 5317.12] - - [1408, 5056, 1, 1280] - - [375, 8980.73] + - [470, 8980.73] - - [4, 2368, 1, 3328] - - [340, 592.634] + - [435, 592.634] - - [5888, 1856, 1, 128] - - [365, 4600.2] + - [460, 4600.2] - - [448, 704, 1, 1280] - - [374, 2286.58] + - [469, 2286.58] - - [2368, 1024, 1, 128] - - [369, 3911.12] + - [464, 3911.12] - - [1024, 448, 1, 3328] - - [374, 7295.24] + - [469, 7295.24] - - [1856, 704, 1, 1280] - - [374, 8881.12] + - [469, 8881.12] - - [5056, 3584, 1, 128] - - [366, 4911.68] + - [461, 4911.68] - - [5888, 5888, 1, 3328] - - [382, 9243.9] + - [477, 9243.9] - - [6784, 1024, 1, 256] - - [385, 5475.41] + - [480, 5475.41] - - [2944, 2368, 1, 256] - - [380, 5670.77] + - [475, 5670.77] - - [256, 448, 1, 256] - - [331, 2293.86] + - [426, 2293.86] - - [5056, 5888, 1, 3328] - - [375, 7848.07] + - [470, 7848.07] - - [1856, 1024, 1, 256] - - [380, 7517.7] + - [475, 7517.7] - - [448, 1408, 1, 1280] - - [374, 6917.54] + - [469, 6917.54] - - [3584, 448, 1, 1280] - - [380, 7980.86] + - [475, 7980.86] - - [1024, 1024, 1, 1280] - - [377, 8384.52] + - [472, 8384.52] - - [448, 5888, 1, 256] - - [374, 7365.75] + - [469, 7365.75] - - [704, 64, 1, 128] - - [319, 358.755] + - [414, 358.755] - - [1408, 6784, 1, 3328] - - [380, 9094.19] + - [475, 9094.19] - - [448, 1024, 1, 128] - - [369, 1773.05] + - [464, 1773.05] - - [4288, 704, 1, 128] - - [366, 4355.38] + - [461, 4355.38] - - [128, 1856, 1, 128] - - [305, 1610.73] + - [400, 1610.73] - - [448, 2368, 1, 3328] - - [380, 7366.47] + - [475, 7366.47] - - [5056, 64, 1, 128] - - [305, 2157.33] + - [400, 2157.33] - - [5056, 2944, 1, 256] - - [374, 9123.16] + - [469, 9123.16] - - [6784, 5888, 1, 128] - - [365, 5285.9] + - [460, 5285.9] - - [704, 1024, 1, 256] - - [380, 6667.35] + - [475, 6667.35] - - [1024, 4, 1, 256] - - [329, 187.346] + - [424, 187.346] - - [2368, 1856, 1, 256] - - [380, 6777.94] + - [475, 6777.94] - - [128, 6784, 1, 1280] - - [377, 7052.71] + - [472, 7052.71] - - [1408, 3584, 1, 3328] - - [381, 9038.05] + - [476, 9038.05] - - [2368, 6784, 1, 256] - - [374, 9181.45] + - [469, 9181.45] - - [5056, 1408, 1, 1280] - - [379, 9422.0] + - [474, 9422.0] - - [256, 256, 1, 128] - - [311, 543.404] + - [406, 543.404] - - [5056, 4288, 1, 128] - - [369, 5340.02] + - [464, 5340.02] - - [1408, 1856, 1, 128] - - [366, 4270.99] + - [461, 4270.99] - - [1408, 5888, 1, 3328] - - [378, 9034.89] + - [473, 9034.89] - - [1856, 256, 1, 256] - - [380, 5847.93] + - [475, 5847.93] - - [6784, 6784, 1, 256] - - [373, 9624.48] + - [468, 9624.48] - - [64, 256, 1, 128] - - [312, 146.549] + - [407, 146.549] - - [4288, 2368, 1, 128] - - [365, 3897.04] + - [460, 3897.04] - - [1856, 4288, 1, 128] - - [366, 4337.17] + - [461, 4337.17] - - [256, 4288, 1, 1280] - - [374, 7499.52] + - [469, 7499.52] - - [2368, 2944, 1, 256] - - [379, 7703.28] + - [474, 7703.28] - - [4, 1856, 1, 256] - - [392, 264.064] + - [487, 264.064] - - [3584, 1856, 1, 1280] - - [374, 9224.43] + - [469, 9224.43] - - [6784, 6784, 1, 128] - - [366, 5476.13] + - [461, 5476.13] - - [256, 1856, 1, 128] - - [369, 1858.82] + - [464, 1858.82] - - [704, 64, 1, 1280] - - [347, 3368.77] + - [442, 3368.77] - - [5888, 5056, 1, 256] - - [380, 5859.91] + - [475, 5859.91] - - [3584, 448, 1, 256] - - [380, 7298.43] + - [475, 7298.43] - - [448, 4288, 1, 128] - - [366, 3813.55] + - [461, 3813.55] - - [2944, 4288, 1, 3328] - - [375, 9149.73] + - [470, 9149.73] - - [256, 6784, 1, 256] - - [374, 7984.95] + - [469, 7984.95] - - [1408, 4288, 1, 128] - - [369, 4728.44] + - [464, 4728.44] - - [2944, 704, 1, 3328] - - [380, 7149.86] + - [475, 7149.86] - - [128, 448, 1, 256] - - [346, 1699.18] + - [441, 1699.18] - - [512, 32, 1, 512] - - [346, 1127.6] + - [441, 1127.6] - - [3584, 3584, 1, 256] - - [375, 8558.11] + - [470, 8558.11] - - [448, 1408, 1, 128] - - [366, 2504.45] + - [461, 2504.45] - - [128, 256, 1, 1280] - - [347, 3216.59] + - [442, 3216.59] - - [3584, 5056, 1, 256] - - [372, 5674.45] + - [467, 5674.45] - - [6784, 128, 1, 256] - - [374, 6216.49] + - [469, 6216.49] - - [4288, 4, 1, 256] - - [390, 435.706] + - [485, 435.706] - - [64, 1408, 1, 3328] - - [348, 6186.01] + - [443, 6186.01] - - [704, 448, 1, 256] - - [380, 4005.08] + - [475, 4005.08] - - [2944, 2368, 1, 1280] - - [381, 8542.8] + - [476, 8542.8] - - [448, 64, 1, 3328] - - [361, 3835.33] + - [456, 3835.33] - - [1408, 3584, 1, 256] - - [374, 8714.63] + - [469, 8714.63] - - [3584, 4, 1, 3328] - - [329, 689.554] + - [424, 689.554] - - [6784, 3584, 1, 256] - - [379, 9271.34] + - [474, 9271.34] - - [256, 128, 1, 128] - - [312, 283.499] + - [407, 283.499] - - [704, 1408, 1, 128] - - [366, 3210.57] + - [461, 3210.57] - - [4, 2368, 1, 256] - - [392, 360.938] + - [487, 360.938] - - [2944, 448, 1, 128] - - [366, 3344.41] + - [461, 3344.41] - - [128, 1408, 1, 256] - - [374, 3186.38] + - [469, 3186.38] - - [4, 2944, 1, 256] - - [390, 384.622] + - [485, 384.622] - - [64, 128, 1, 3328] - - [343, 2103.72] + - [438, 2103.72] - - [5056, 2368, 1, 128] - - [366, 5219.76] + - [461, 5219.76] - - [2944, 2944, 1, 3328] - - [383, 9174.69] + - [478, 9174.69] - - [5056, 6784, 1, 256] - - [385, 8992.36] + - [480, 8992.36] - - [1856, 3584, 1, 128] - - [366, 4957.27] + - [461, 4957.27] - - [128, 2944, 1, 128] - - [304, 2241.48] + - [399, 2241.48] - - [1024, 704, 1, 3328] - - [384, 6545.11] + - [479, 6545.11] - - [6784, 448, 1, 256] - - [380, 5379.25] + - [475, 5379.25] - - [3584, 6784, 1, 128] - - [366, 5102.01] + - [461, 5102.01] - - [128, 4288, 1, 256] - - [374, 5211.86] + - [469, 5211.86] - - [704, 448, 1, 3328] - - [375, 4504.15] + - [470, 4504.15] - - [1024, 1024, 1, 3328] - - [377, 8009.77] + - [472, 8009.77] - - [128, 128, 1, 3328] - - [360, 3185.03] + - [455, 3185.03] - - [5056, 1856, 1, 256] - - [374, 9138.43] + - [469, 9138.43] - - [256, 128, 1, 256] - - [346, 1205.36] + - [441, 1205.36] - - [1024, 1856, 1, 256] - - [385, 6375.09] + - [480, 6375.09] - - [4288, 64, 1, 128] - - [302, 1695.43] + - [397, 1695.43] - - [256, 448, 1, 3328] - - [349, 5659.67] + - [444, 5659.67] - - [1408, 6784, 1, 1280] - - [374, 9349.2] + - [469, 9349.2] - - [3584, 3584, 1, 1280] - - [379, 9302.19] + - [474, 9302.19] - - [64, 2368, 1, 1280] - - [348, 4433.07] + - [443, 4433.07] - - [448, 2368, 1, 1280] - - [374, 7250.77] + - [469, 7250.77] - - [5888, 5888, 1, 128] - - [366, 4616.03] + - [461, 4616.03] - - [64, 6784, 1, 3328] - - [380, 6987.23] + - [475, 6987.23] - - [2944, 256, 1, 1280] - - [383, 6127.45] + - [478, 6127.45] - - [5056, 5888, 1, 128] - - [365, 5106.39] + - [460, 5106.39] - - [256, 2368, 1, 128] - - [366, 2141.23] + - [461, 2141.23] - - [5056, 2368, 1, 3328] - - [377, 9041.75] + - [472, 9041.75] - - [2944, 4288, 1, 256] - - [385, 8691.22] + - [480, 8691.22] - - [1408, 3584, 1, 1280] - - [374, 9070.0] + - [469, 9070.0] - - [2368, 64, 1, 256] - - [346, 2412.87] + - [441, 2412.87] - - [64, 448, 1, 3328] - - [361, 3739.14] + - [456, 3739.14] - - [256, 256, 1, 3328] - - [348, 5304.18] + - [443, 5304.18] - - [5888, 4, 1, 128] - - [387, 105.655] + - [482, 105.655] - - [1856, 704, 1, 256] - - [374, 8025.43] + - [469, 8025.43] - - [4, 4288, 1, 1280] - - [321, 579.07] + - [416, 579.07] - - [1408, 448, 1, 3328] - - [382, 5714.51] + - [477, 5714.51] - - [1024, 4, 1, 3328] - - [340, 608.649] + - [435, 608.649] - - [2368, 256, 1, 256] - - [380, 5173.08] + - [475, 5173.08] - - [2368, 6784, 1, 3328] - - [380, 9456.61] + - [475, 9456.61] - - [1856, 1408, 1, 1280] - - [385, 7805.19] + - [480, 7805.19] - - [1856, 448, 1, 1280] - - [372, 6185.04] + - [467, 6185.04] - - [6784, 704, 1, 128] - - [366, 4597.87] + - [461, 4597.87] - - [4, 4, 1, 256] - - [343, 0.791892] + - [438, 0.791892] - - [128, 5888, 1, 128] - - [304, 2691.76] + - [399, 2691.76] - - [1408, 5888, 1, 256] - - [379, 7164.27] + - [474, 7164.27] - - [704, 2944, 1, 1280] - - [381, 8139.81] + - [476, 8139.81] - - [1856, 2368, 1, 128] - - [369, 4623.38] + - [464, 4623.38] - - [4096, 7133, 1, 4096] - - [373, 9940.07] + - [468, 9940.07] - - [256, 64, 1, 256] - - [337, 689.953] + - [432, 689.953] - - [1024, 1024, 1, 256] - - [380, 7216.11] + - [475, 7216.11] - - [704, 1856, 1, 256] - - [380, 6364.17] + - [475, 6364.17] - - [128, 4288, 1, 3328] - - [336, 7200.59] + - [431, 7200.59] - - [3584, 704, 1, 1280] - - [383, 7972.08] + - [478, 7972.08] - - [256, 128, 1, 1280] - - [334, 2702.62] + - [429, 2702.62] - - [2368, 4, 1, 256] - - [329, 326.018] + - [424, 326.018] - - [256, 2368, 1, 1280] - - [374, 6638.93] + - [469, 6638.93] - - [2944, 6784, 1, 128] - - [365, 5233.53] + - [460, 5233.53] - - [3584, 448, 1, 3328] - - [374, 8094.4] + - [469, 8094.4] - - [1408, 4, 1, 256] - - [392, 243.646] + - [487, 243.646] - - [704, 2368, 1, 3328] - - [374, 8403.11] + - [469, 8403.11] - - [2944, 448, 1, 256] - - [374, 7022.59] + - [469, 7022.59] - - [1856, 448, 1, 128] - - [369, 2842.79] + - [464, 2842.79] - - [2368, 128, 1, 1280] - - [356, 5685.52] + - [451, 5685.52] - - [256, 5888, 1, 128] - - [371, 2178.71] + - [466, 2178.71] - - [64, 6784, 1, 256] - - [374, 5385.23] + - [469, 5385.23] - - [64, 5056, 1, 1280] - - [348, 5603.29] + - [443, 5603.29] - - [4, 6784, 1, 128] - - [386, 180.256] + - [481, 180.256] - - [2944, 2944, 1, 1280] - - [383, 9129.39] + - [478, 9129.39] - - [5888, 2368, 1, 256] - - [385, 6961.69] + - [480, 6961.69] - - [4, 3584, 1, 1280] - - [329, 646.23] + - [424, 646.23] - - [1408, 128, 1, 128] - - [315, 1172.29] + - [410, 1172.29] - - [6784, 704, 1, 3328] - - [380, 9084.62] + - [475, 9084.62] - - [128, 64, 1, 1280] - - [359, 1260.41] + - [454, 1260.41] - - [2368, 256, 1, 1280] - - [380, 6643.48] + - [475, 6643.48] - - [4, 448, 1, 3328] - - [343, 433.514] + - [438, 433.514] - - [5888, 4288, 1, 128] - - [367, 4753.17] + - [462, 4753.17] - - [4, 5888, 1, 256] - - [329, 471.14] + - [424, 471.14] - - [1408, 2944, 1, 3328] - - [383, 9207.1] + - [478, 9207.1] - - [3584, 704, 1, 128] - - [369, 3762.46] + - [464, 3762.46] - - [64, 1024, 1, 256] - - [347, 1807.99] + - [442, 1807.99] - - [5056, 5056, 1, 128] - - [370, 4830.16] + - [465, 4830.16] - - [2368, 448, 1, 1280] - - [374, 7263.16] + - [469, 7263.16] - - [128, 3584, 1, 256] - - [377, 4369.17] + - [472, 4369.17] - - [704, 448, 1, 1280] - - [375, 4205.33] + - [470, 4205.33] - - [448, 5056, 1, 128] - - [366, 3855.57] + - [461, 3855.57] - - [256, 4, 1, 1280] - - [397, 157.638] + - [492, 157.638] - - [128, 5056, 1, 256] - - [380, 6109.06] + - [475, 6109.06] - - [1408, 5056, 1, 128] - - [369, 4836.68] + - [464, 4836.68] - - [2944, 3584, 1, 128] - - [369, 4532.19] + - [464, 4532.19] - - [3584, 2368, 1, 256] - - [374, 8951.34] + - [469, 8951.34] - - [5888, 5056, 1, 1280] - - [385, 9276.49] + - [480, 9276.49] - - [2368, 5056, 1, 128] - - [369, 5167.66] + - [464, 5167.66] - - [64, 704, 1, 256] - - [329, 1501.97] + - [424, 1501.97] - - [4288, 256, 1, 1280] - - [374, 7496.3] + - [469, 7496.3] - - [3584, 3584, 1, 3328] - - [375, 9301.77] + - [470, 9301.77] - - [1024, 256, 1, 128] - - [366, 1508.84] + - [461, 1508.84] - - [4, 704, 1, 128] - - [387, 12.1469] + - [482, 12.1469] - - [5888, 6784, 1, 256] - - [373, 9370.47] + - [468, 9370.47] - - [4288, 2944, 1, 3328] - - [377, 9149.09] + - [472, 9149.09] - - [2944, 64, 1, 128] - - [313, 1456.46] + - [408, 1456.46] - - [1856, 64, 1, 256] - - [339, 2210.03] + - [434, 2210.03] - - [4288, 128, 1, 3328] - - [333, 6471.95] + - [428, 6471.95] - - [4288, 704, 1, 1280] - - [380, 8934.61] + - [475, 8934.61] - - [256, 5056, 1, 1280] - - [374, 8439.13] + - [469, 8439.13] - - [1408, 256, 1, 128] - - [369, 1769.17] + - [464, 1769.17] - - [2944, 5888, 1, 3328] - - [374, 9448.04] + - [469, 9448.04] - - [6784, 5888, 1, 1280] - - [385, 9372.25] + - [480, 9372.25] - - [704, 128, 1, 256] - - [331, 2059.8] + - [426, 2059.8] - - [5888, 4288, 1, 1280] - - [377, 9244.32] + - [472, 9244.32] - - [448, 256, 1, 1280] - - [356, 4741.72] + - [451, 4741.72] - - [5888, 3584, 1, 128] - - [365, 4980.06] + - [460, 4980.06] - - [1856, 1856, 1, 128] - - [369, 4363.98] + - [464, 4363.98] - - [5056, 4, 1, 1280] - - [389, 629.641] + - [484, 629.641] - - [256, 1408, 1, 1280] - - [380, 5588.44] + - [475, 5588.44] - - [512, 16, 1, 512] - - [340, 689.953] + - [435, 689.953] - - [704, 3584, 1, 128] - - [369, 4069.67] + - [464, 4069.67] - - [5888, 448, 1, 3328] - - [385, 7925.94] + - [480, 7925.94] - - [2368, 4288, 1, 1280] - - [384, 8492.7] + - [479, 8492.7] - - [4288, 2944, 1, 128] - - [366, 5238.21] + - [461, 5238.21] - - [1024, 6784, 1, 3328] - - [380, 8578.18] + - [475, 8578.18] - - [128, 2368, 1, 256] - - [380, 3788.9] + - [475, 3788.9] - - [6784, 64, 1, 3328] - - [374, 7003.46] + - [469, 7003.46] - - [5056, 2944, 1, 3328] - - [377, 8575.45] + - [472, 8575.45] - - [448, 128, 1, 256] - - [329, 1715.06] + - [424, 1715.06] - - [2944, 3584, 1, 256] - - [374, 8994.26] + - [469, 8994.26] - - [1408, 1408, 1, 3328] - - [372, 8757.7] + - [467, 8757.7] - - [1856, 128, 1, 1280] - - [374, 5598.17] + - [469, 5598.17] - - [3584, 3584, 1, 128] - - [365, 4787.44] + - [460, 4787.44] - - [64, 3584, 1, 256] - - [380, 3546.01] + - [475, 3546.01] - - [1408, 4, 1, 3328] - - [324, 640.24] + - [419, 640.24] - - [128, 2944, 1, 3328] - - [348, 7204.24] + - [443, 7204.24] - - [3584, 704, 1, 256] - - [374, 6239.69] + - [469, 6239.69] - - [2944, 448, 1, 3328] - - [380, 7726.71] + - [475, 7726.71] - - [3584, 1408, 1, 3328] - - [372, 9358.78] + - [467, 9358.78] - - [704, 3584, 1, 1280] - - [380, 8005.28] + - [475, 8005.28] - - [2944, 6784, 1, 1280] - - [372, 9487.73] + - [467, 9487.73] - - [1856, 6784, 1, 256] - - [374, 5684.56] + - [469, 5684.56] - - [4288, 448, 1, 3328] - - [380, 8410.38] + - [475, 8410.38] - - [6784, 4288, 1, 128] - - [370, 4785.58] + - [465, 4785.58] - - [6784, 704, 1, 1280] - - [374, 5579.05] + - [469, 5579.05] - - [256, 4288, 1, 256] - - [374, 6781.43] + - [469, 6781.43] - - [3584, 64, 1, 128] - - [313, 1474.0] + - [408, 1474.0] - - [5888, 1024, 1, 3328] - - [372, 8639.49] + - [467, 8639.49] - - [448, 64, 1, 128] - - [304, 259.282] + - [399, 259.282] - - [704, 6784, 1, 1280] - - [380, 9027.25] + - [475, 9027.25] - - [5888, 128, 1, 256] - - [380, 6812.88] + - [475, 6812.88] - - [2368, 448, 1, 3328] - - [380, 7356.63] + - [475, 7356.63] - - [1856, 5056, 1, 3328] - - [379, 8871.56] + - [474, 8871.56] - - [4, 6784, 1, 256] - - [388, 469.479] + - [483, 469.479] - - [1024, 3584, 1, 128] - - [366, 3428.02] + - [461, 3428.02] - - [1024, 1408, 1, 128] - - [369, 2935.05] + - [464, 2935.05] - - [2368, 2944, 1, 128] - - [369, 4888.02] + - [464, 4888.02] - - [5056, 64, 1, 256] - - [338, 3186.16] + - [433, 3186.16] - - [4, 448, 1, 1280] - - [343, 273.167] + - [438, 273.167] - - [5056, 2944, 1, 128] - - [370, 4752.79] + - [465, 4752.79] - - [5888, 5056, 1, 3328] - - [384, 9124.77] + - [479, 9124.77] - - [1024, 704, 1, 128] - - [369, 2302.36] + - [464, 2302.36] - - [1408, 2368, 1, 128] - - [369, 3826.95] + - [464, 3826.95] - - [5888, 2368, 1, 128] - - [366, 4912.77] + - [461, 4912.77] - - [128, 5056, 1, 3328] - - [356, 7583.8] + - [451, 7583.8] - - [3584, 6784, 1, 1280] - - [383, 9313.5] + - [478, 9313.5] - - [3072, 7435, 1, 1024] - - [377, 9322.07] + - [472, 9322.07] - - [1856, 5888, 1, 256] - - [374, 5778.34] + - [469, 5778.34] - - [256, 256, 1, 256] - - [326, 1576.91] + - [421, 1576.91] - - [256, 64, 1, 128] - - [312, 173.705] + - [407, 173.705] - - [4288, 4288, 1, 3328] - - [379, 8416.27] + - [474, 8416.27] - - [4288, 1408, 1, 1280] - - [385, 9301.97] + - [480, 9301.97] - - [3584, 5056, 1, 128] - - [371, 4344.94] + - [466, 4344.94] - - [4, 1024, 1, 3328] - - [340, 615.239] + - [435, 615.239] - - [4288, 2368, 1, 256] - - [374, 9142.67] + - [469, 9142.67] - - [2944, 5056, 1, 1280] - - [374, 9399.69] + - [469, 9399.69] - - [448, 6784, 1, 256] - - [373, 5710.93] + - [468, 5710.93] - - [64, 1024, 1, 3328] - - [356, 4975.1] + - [451, 4975.1] - - [6784, 2368, 1, 3328] - - [383, 9207.63] + - [478, 9207.63] - - [256, 1024, 1, 1280] - - [380, 5983.42] + - [475, 5983.42] - - [704, 4, 1, 128] - - [386, 15.1187] + - [481, 15.1187] - - [256, 4, 1, 256] - - [343, 52.9516] + - [438, 52.9516] - - [4288, 128, 1, 256] - - [374, 5242.98] + - [469, 5242.98] - - [4288, 1856, 1, 3328] - - [385, 9354.06] + - [480, 9354.06] - - [3584, 448, 1, 128] - - [366, 3353.9] + - [461, 3353.9] - - [256, 4, 1, 3328] - - [397, 313.324] + - [492, 313.324] - - [4, 1408, 1, 1280] - - [340, 509.207] + - [435, 509.207] - - [3584, 64, 1, 1280] - - [328, 5198.42] + - [423, 5198.42] - - [1408, 448, 1, 128] - - [366, 2628.37] + - [461, 2628.37] - - [3584, 1024, 1, 1280] - - [380, 8535.01] + - [475, 8535.01] - - [1856, 5056, 1, 256] - - [372, 8184.49] + - [467, 8184.49] - - [4, 3584, 1, 256] - - [390, 395.576] + - [485, 395.576] - - [1024, 4288, 1, 256] - - [375, 5966.52] + - [470, 5966.52] - - [5888, 3584, 1, 3328] - - [378, 9189.43] + - [473, 9189.43] - - [4, 256, 1, 256] - - [394, 41.5785] + - [489, 41.5785] - - [5056, 3584, 1, 3328] - - [379, 9431.92] + - [474, 9431.92] - - [128, 5888, 1, 1280] - - [374, 8192.1] + - [469, 8192.1] - - [704, 448, 1, 128] - - [366, 1510.96] + - [461, 1510.96] - - [2368, 1408, 1, 1280] - - [374, 8415.65] + - [469, 8415.65] - - [5056, 2944, 1, 1280] - - [385, 9294.77] + - [480, 9294.77] - - [4, 4, 1, 128] - - [387, 0.1356549] + - [482, 0.1356549] - - [3584, 256, 1, 256] - - [374, 6749.55] + - [469, 6749.55] - - [128, 1856, 1, 3328] - - [327, 6797.09] + - [422, 6797.09] - - [1024, 6784, 1, 256] - - [380, 8783.09] + - [475, 8783.09] - - [4, 128, 1, 256] - - [340, 27.4067] + - [435, 27.4067] - - [64, 64, 1, 1280] - - [359, 712.448] + - [454, 712.448] - - [6784, 4, 1, 128] - - [387, 122.06] + - [482, 122.06] - - [2944, 1408, 1, 128] - - [369, 4430.46] + - [464, 4430.46] - - [448, 128, 1, 3328] - - [356, 5097.34] + - [451, 5097.34] - - [64, 2944, 1, 3328] - - [356, 6362.2] + - [451, 6362.2] - - [64, 4288, 1, 3328] - - [356, 6565.01] + - [451, 6565.01] - - [5056, 6784, 1, 3328] - - [380, 8121.18] + - [475, 8121.18] - - [128, 2944, 1, 256] - - [374, 4692.17] + - [469, 4692.17] - - [128, 6784, 1, 128] - - [303, 2687.46] + - [398, 2687.46] - - [3584, 4288, 1, 256] - - [380, 9193.99] + - [475, 9193.99] - - [448, 1856, 1, 256] - - [380, 6231.39] + - [475, 6231.39] - - [1856, 6784, 1, 3328] - - [385, 9191.48] + - [480, 9191.48] - - [3584, 128, 1, 3328] - - [374, 7368.47] + - [469, 7368.47] - - [64, 1856, 1, 256] - - [325, 2184.63] + - [420, 2184.63] - - [1024, 448, 1, 1280] - - [380, 6977.32] + - [475, 6977.32] - - [5888, 4288, 1, 256] - - [380, 5780.5] + - [475, 5780.5] - - [4, 448, 1, 128] - - [387, 9.06] + - [482, 9.06] - - [5056, 1408, 1, 256] - - [374, 5601.35] + - [469, 5601.35] - - [64, 256, 1, 1280] - - [340, 1927.63] + - [435, 1927.63] - - [3584, 1024, 1, 256] - - [385, 7542.84] + - [480, 7542.84] - - [256, 704, 1, 256] - - [374, 2957.62] + - [469, 2957.62] - - [5888, 5888, 1, 256] - - [385, 7344.14] + - [480, 7344.14] - - [4288, 1024, 1, 1280] - - [380, 8925.84] + - [475, 8925.84] - - [5888, 128, 1, 3328] - - [374, 8410.07] + - [469, 8410.07] - - [448, 6784, 1, 3328] - - [374, 8862.56] + - [469, 8862.56] - - [2944, 1408, 1, 1280] - - [385, 7478.93] + - [480, 7478.93] - - [1024, 32, 1, 512] - - [329, 1777.35] + - [424, 1777.35] - - [2944, 1856, 1, 3328] - - [374, 9153.43] + - [469, 9153.43] - - [2368, 64, 1, 128] - - [313, 1102.3] + - [408, 1102.3] - - [2944, 2944, 1, 128] - - [365, 4591.95] + - [460, 4591.95] - - [4, 128, 1, 3328] - - [395, 119.09] + - [490, 119.09] - - [3584, 5888, 1, 1280] - - [374, 9222.49] + - [469, 9222.49] - - [64, 4, 1, 128] - - [386, 1.03516] + - [481, 1.03516] - - [6784, 1856, 1, 1280] - - [374, 9136.07] + - [469, 9136.07] - - [2944, 5056, 1, 256] - - [380, 8860.13] + - [475, 8860.13] - - [2944, 5888, 1, 1280] - - [373, 9643.63] + - [468, 9643.63] - - [5888, 256, 1, 3328] - - [380, 8799.53] + - [475, 8799.53] - - [1856, 5888, 1, 3328] - - [380, 9457.53] + - [475, 9457.53] - - [3584, 1408, 1, 256] - - [380, 8672.53] + - [475, 8672.53] - - [704, 3584, 1, 3328] - - [380, 8525.3] + - [475, 8525.3] - - [5056, 448, 1, 1280] - - [380, 8843.77] + - [475, 8843.77] - - [3584, 1856, 1, 3328] - - [372, 8881.53] + - [467, 8881.53] - - [64, 1408, 1, 128] - - [301, 747.142] + - [396, 747.142] - - [1408, 704, 1, 1280] - - [374, 8342.93] + - [469, 8342.93] - - [2944, 1024, 1, 256] - - [385, 8079.58] + - [480, 8079.58] - - [1024, 2368, 1, 128] - - [369, 3347.58] + - [464, 3347.58] - - [2368, 4288, 1, 3328] - - [380, 9467.67] + - [475, 9467.67] - - [4, 1408, 1, 256] - - [392, 257.563] + - [487, 257.563] - - [1024, 1408, 1, 1280] - - [380, 8241.84] + - [475, 8241.84] - - [64, 64, 1, 256] - - [340, 190.059] + - [435, 190.059] - - [704, 256, 1, 3328] - - [374, 4519.28] + - [469, 4519.28] - - [6784, 5056, 1, 256] - - [373, 9133.78] + - [468, 9133.78] - - [4, 4288, 1, 3328] - - [324, 670.075] + - [419, 670.075] - - [448, 6784, 1, 128] - - [366, 4481.92] + - [461, 4481.92] - - [4, 704, 1, 3328] - - [396, 523.071] + - [491, 523.071] - - [448, 2944, 1, 256] - - [374, 7022.59] + - [469, 7022.59] - - [2944, 6784, 1, 256] - - [380, 9199.84] + - [475, 9199.84] - - [2368, 2368, 1, 1280] - - [385, 8646.84] + - [480, 8646.84] - - [4, 4, 1, 1280] - - [343, 3.11176] + - [438, 3.11176] - - [1856, 3584, 1, 1280] - - [372, 8805.45] + - [467, 8805.45] - - [64, 2944, 1, 256] - - [346, 2565.76] + - [441, 2565.76] - - [3584, 1408, 1, 1280] - - [385, 9273.12] + - [480, 9273.12] - - [448, 256, 1, 128] - - [301, 941.13] + - [396, 941.13] - - [4288, 448, 1, 128] - - [367, 3215.2] + - [462, 3215.2] - - [5056, 256, 1, 1280] - - [380, 8790.13] + - [475, 8790.13] - - [1856, 1408, 1, 3328] - - [374, 9310.73] + - [469, 9310.73] - - [128, 128, 1, 128] - - [309, 155.215] + - [404, 155.215] - - [1024, 4288, 1, 3328] - - [377, 8528.12] + - [472, 8528.12] - - [448, 2368, 1, 256] - - [381, 5097.34] + - [476, 5097.34] - - [1024, 4, 1, 128] - - [387, 10.3721] + - [482, 10.3721] - - [5056, 448, 1, 256] - - [380, 8236.78] + - [475, 8236.78] - - [2944, 2368, 1, 3328] - - [373, 9331.16] + - [468, 9331.16] - - [704, 128, 1, 3328] - - [348, 5969.3] + - [443, 5969.3] - - [64, 64, 1, 3328] - - [364, 1494.78] + - [459, 1494.78] - - [1024, 1856, 1, 1280] - - [379, 6356.43] + - [474, 6356.43] - - [6784, 1856, 1, 256] - - [380, 9068.63] + - [475, 9068.63] - - [128, 2368, 1, 3328] - - [356, 6714.22] + - [451, 6714.22] - - [1024, 5888, 1, 256] - - [380, 5501.6] + - [475, 5501.6] - - [5056, 128, 1, 1280] - - [336, 6455.64] + - [431, 6455.64] - - [5056, 64, 1, 3328] - - [341, 6703.81] + - [436, 6703.81] - - [128, 704, 1, 128] - - [302, 696.618] + - [397, 696.618] - - [1408, 2368, 1, 256] - - [374, 8667.25] + - [469, 8667.25] - - [1408, 1408, 1, 256] - - [385, 7615.81] + - [480, 7615.81] - - [4, 64, 1, 128] - - [387, 1.08463] + - [482, 1.08463] - - [64, 128, 1, 1280] - - [359, 1379.81] + - [454, 1379.81] - - [2368, 2368, 1, 128] - - [369, 4582.26] + - [464, 4582.26] - - [64, 5888, 1, 128] - - [302, 2086.37] + - [397, 2086.37] - - [5888, 4, 1, 3328] - - [323, 667.514] + - [418, 667.514] - - [6784, 1408, 1, 128] - - [370, 4516.34] + - [465, 4516.34] - - [4288, 5888, 1, 256] - - [385, 8497.43] + - [480, 8497.43] - - [1408, 5056, 1, 256] - - [374, 8867.46] + - [469, 8867.46] - - [5056, 128, 1, 3328] - - [356, 7678.98] + - [451, 7678.98] - - [128, 128, 1, 1280] - - [344, 2016.59] + - [439, 2016.59] - - [448, 704, 1, 256] - - [375, 3030.89] + - [470, 3030.89] - - [4288, 3584, 1, 128] - - [366, 5246.33] + - [461, 5246.33] - - [2944, 128, 1, 3328] - - [341, 6795.16] + - [436, 6795.16] - - [128, 5056, 1, 1280] - - [327, 6193.09] + - [422, 6193.09] - - [3584, 5056, 1, 1280] - - [379, 9499.17] + - [474, 9499.17] - - [256, 448, 1, 1280] - - [335, 4267.56] + - [430, 4267.56] - - [704, 704, 1, 128] - - [369, 2259.32] + - [464, 2259.32] - - [5056, 4, 1, 128] - - [387, 12.5313] + - [482, 12.5313] - - [704, 256, 1, 1280] - - [374, 4355.97] + - [469, 4355.97] - - [64, 2368, 1, 3328] - - [348, 6310.97] + - [443, 6310.97] - - [1856, 1024, 1, 128] - - [365, 4065.43] + - [460, 4065.43] - - [1856, 64, 1, 128] - - [304, 936.329] + - [399, 936.329] - - [64, 6784, 1, 1280] - - [327, 5731.8] + - [422, 5731.8] - - [704, 4288, 1, 256] - - [380, 5218.9] + - [475, 5218.9] - - [5888, 2368, 1, 1280] - - [374, 9378.9] + - [469, 9378.9] - - [128, 256, 1, 256] - - [344, 1219.37] + - [439, 1219.37] - - [256, 64, 1, 1280] - - [346, 1820.54] + - [441, 1820.54] - - [2368, 5888, 1, 1280] - - [385, 9143.64] + - [480, 9143.64] - - [5888, 256, 1, 1280] - - [374, 8678.47] + - [469, 8678.47] - - [4, 5888, 1, 1280] - - [321, 668.242] + - [416, 668.242] - - [704, 128, 1, 128] - - [309, 649.556] + - [404, 649.556] - - [1024, 4, 1, 1280] - - [340, 478.465] + - [435, 478.465] - - [2368, 1856, 1, 3328] - - [372, 8153.87] + - [467, 8153.87] - - [2368, 128, 1, 128] - - [307, 1858.21] + - [402, 1858.21] - - [2944, 704, 1, 256] - - [374, 8438.07] + - [469, 8438.07] - - [5056, 128, 1, 128] - - [303, 2689.63] + - [398, 2689.63] - - [256, 704, 1, 3328] - - [374, 4541.18] + - [469, 4541.18] - - [704, 3584, 1, 256] - - [375, 7771.07] + - [470, 7771.07] - - [1024, 1024, 1, 1024] - - [380, 8305.62] + - [475, 8305.62] - - [704, 2944, 1, 3328] - - [380, 9166.48] + - [475, 9166.48] - - [6784, 1024, 1, 128] - - [365, 4362.31] + - [460, 4362.31] - - [256, 448, 1, 128] - - [312, 899.614] + - [407, 899.614] - - [448, 1024, 1, 3328] - - [374, 7385.56] + - [469, 7385.56] - - [2944, 1024, 1, 3328] - - [377, 8779.81] + - [472, 8779.81] - - [2944, 5056, 1, 128] - - [369, 5103.11] + - [464, 5103.11] - - [1408, 6784, 1, 256] - - [380, 8346.89] + - [475, 8346.89] - - [6784, 1408, 1, 3328] - - [376, 8878.4] + - [471, 8878.4] - - [4288, 6784, 1, 128] - - [365, 5432.99] + - [460, 5432.99] - - [704, 64, 1, 256] - - [354, 1441.89] + - [449, 1441.89] - - [5888, 4, 1, 1280] - - [391, 636.641] + - [486, 636.641] - - [256, 2368, 1, 3328] - - [374, 6804.8] + - [469, 6804.8] - - [6784, 2944, 1, 1280] - - [373, 9472.26] + - [468, 9472.26] - - [4288, 1856, 1, 128] - - [369, 4886.38] + - [464, 4886.38] - - [1856, 2944, 1, 128] - - [366, 4642.96] + - [461, 4642.96] - - [6784, 448, 1, 128] - - [366, 4369.17] + - [461, 4369.17] - - [64, 3584, 1, 128] - - [313, 1645.85] + - [408, 1645.85] - - [448, 5056, 1, 1280] - - [374, 8553.64] + - [469, 8553.64] - - [2368, 1856, 1, 128] - - [366, 4741.85] + - [461, 4741.85] - - [128, 448, 1, 1280] - - [356, 3745.01] + - [451, 3745.01] - - [4288, 704, 1, 256] - - [374, 8444.16] + - [469, 8444.16] - - [256, 3584, 1, 128] - - [366, 2454.96] + - [461, 2454.96] - - [5888, 704, 1, 256] - - [374, 8819.57] + - [469, 8819.57] - - [3584, 1024, 1, 128] - - [369, 4094.96] + - [464, 4094.96] - - [256, 5888, 1, 3328] - - [383, 8538.33] + - [478, 8538.33] - - [1408, 4288, 1, 3328] - - [385, 9212.57] + - [480, 9212.57] - - [6784, 4288, 1, 256] - - [373, 9163.12] + - [468, 9163.12] - - [4288, 256, 1, 128] - - [366, 3081.44] + - [461, 3081.44] - - [5888, 256, 1, 256] - - [374, 7680.75] + - [469, 7680.75] - - [6784, 1024, 1, 1280] - - [385, 9248.63] + - [480, 9248.63] - - [5888, 1024, 1, 128] - - [369, 4061.94] + - [464, 4061.94] - - [1024, 128, 1, 256] - - [380, 2317.39] + - [475, 2317.39] - - [128, 64, 1, 3328] - - [363, 2116.79] + - [458, 2116.79] - - [448, 64, 1, 256] - - [346, 1079.52] + - [441, 1079.52] - - [2368, 256, 1, 128] - - [367, 2229.83] + - [462, 2229.83] - - [6784, 3584, 1, 1280] - - [380, 9096.6] + - [475, 9096.6] - - [1024, 6784, 1, 1280] - - [378, 9112.9] + - [473, 9112.9] - - [2944, 64, 1, 1280] - - [336, 4983.0] + - [431, 4983.0] - - [1408, 2944, 1, 1280] - - [375, 9131.63] + - [470, 9131.63] - - [256, 1856, 1, 256] - - [383, 4432.86] + - [478, 4432.86] - - [1408, 2368, 1, 3328] - - [383, 8449.18] + - [478, 8449.18] - - [2944, 4, 1, 3328] - - [329, 673.94] + - [424, 673.94] - - [128, 1408, 1, 3328] - - [348, 6582.47] + - [443, 6582.47] - - [2944, 1856, 1, 128] - - [366, 4827.54] + - [461, 4827.54] - - [256, 2944, 1, 128] - - [369, 2416.66] + - [464, 2416.66] - - [256, 6784, 1, 128] - - [369, 3118.76] + - [464, 3118.76] - - [2368, 4, 1, 128] - - [387, 22.7197] + - [482, 22.7197] - - [1408, 256, 1, 3328] - - [374, 3733.82] + - [469, 3733.82] - - [1856, 4, 1, 128] - - [386, 7.20009] + - [481, 7.20009] - - [1024, 16, 1, 512] - - [342, 1165.18] + - [437, 1165.18] - - [5056, 6784, 1, 128] - - [370, 4949.13] + - [465, 4949.13] - - [4288, 5056, 1, 128] - - [369, 4966.9] + - [464, 4966.9] - - [1856, 5888, 1, 128] - - [365, 4351.76] + - [460, 4351.76] - - [2944, 5888, 1, 256] - - [385, 8460.99] + - [480, 8460.99] - - [3584, 1856, 1, 256] - - [380, 8876.7] + - [475, 8876.7] - - [4288, 3584, 1, 1280] - - [373, 9603.7] + - [468, 9603.7] - - [2368, 448, 1, 256] - - [374, 6604.7] + - [469, 6604.7] - - [4288, 256, 1, 3328] - - [374, 7619.89] + - [469, 7619.89] - - [1856, 704, 1, 128] - - [366, 3629.61] + - [461, 3629.61] - - [1408, 64, 1, 256] - - [330, 2168.21] + - [425, 2168.21] - - [64, 1856, 1, 128] - - [306, 979.762] + - [401, 979.762] - - [4, 256, 1, 128] - - [387, 5.23595] + - [482, 5.23595] - - [704, 4288, 1, 3328] - - [380, 9014.52] + - [475, 9014.52] - - [704, 5888, 1, 128] - - [367, 4221.77] + - [462, 4221.77] - - [6784, 3584, 1, 128] - - [365, 5360.73] + - [460, 5360.73] - - [1024, 64, 1, 256] - - [325, 1588.85] + - [420, 1588.85] - - [64, 2368, 1, 256] - - [380, 2552.55] + - [475, 2552.55] - - [4288, 5056, 1, 3328] - - [379, 8193.38] + - [474, 8193.38] - - [4, 1856, 1, 1280] - - [329, 499.192] + - [424, 499.192] - - [4288, 128, 1, 128] - - [366, 2373.57] + - [461, 2373.57] - - [1408, 1408, 1, 128] - - [369, 3753.88] + - [464, 3753.88] - - [1024, 128, 1, 3328] - - [351, 5656.32] + - [446, 5656.32] - - [1856, 128, 1, 128] - - [302, 1617.58] + - [397, 1617.58] - - [5056, 2368, 1, 256] - - [385, 5553.41] + - [480, 5553.41] - - [4288, 704, 1, 3328] - - [373, 6962.06] + - [468, 6962.06] - - [448, 3584, 1, 256] - - [383, 5981.5] + - [478, 5981.5] - - [64, 128, 1, 128] - - [320, 74.9983] + - [415, 74.9983] - - [2368, 64, 1, 1280] - - [356, 5041.33] + - [451, 5041.33] - - [2368, 1024, 1, 1280] - - [381, 7740.97] + - [476, 7740.97] - - [2944, 1408, 1, 3328] - - [383, 9204.65] + - [478, 9204.65] - - [1408, 448, 1, 256] - - [380, 5954.4] + - [475, 5954.4] - - [1024, 1408, 1, 3328] - - [377, 8161.54] + - [472, 8161.54] - - [2560, 7133, 1, 2560] - - [372, 9636.69] + - [467, 9636.69] - - [1408, 4, 1, 1280] - - [324, 520.979] + - [419, 520.979] - - [5888, 3584, 1, 256] - - [385, 9225.26] + - [480, 9225.26] - - [128, 1024, 1, 1280] - - [327, 4755.55] + - [422, 4755.55] - - [1408, 1856, 1, 3328] - - [377, 9130.87] + - [472, 9130.87] - - [4, 4, 1, 3328] - - [397, 7.03333] + - [492, 7.03333] - - [6784, 1408, 1, 1280] - - [374, 9346.91] + - [469, 9346.91] - - [4, 1024, 1, 1280] - - [324, 422.913] + - [419, 422.913] - - [704, 2944, 1, 256] - - [380, 8332.06] + - [475, 8332.06] - - [704, 4288, 1, 128] - - [366, 4371.14] + - [461, 4371.14] - - [2368, 4288, 1, 128] - - [366, 3988.89] + - [461, 3988.89] - - [64, 4288, 1, 1280] - - [356, 5407.63] + - [451, 5407.63] - - [6784, 64, 1, 1280] - - [336, 5708.25] + - [431, 5708.25] - - [3584, 128, 1, 128] - - [302, 2463.2] + - [397, 2463.2] - - [1024, 6784, 1, 128] - - [367, 3862.12] + - [462, 3862.12] - - [4, 1856, 1, 128] - - [387, 30.6362] + - [482, 30.6362] - - [1408, 64, 1, 3328] - - [356, 6095.48] + - [451, 6095.48] - - [6784, 4, 1, 256] - - [389, 487.938] + - [484, 487.938] - - [1408, 1408, 1, 1280] - - [385, 8640.63] + - [480, 8640.63] - - [256, 2368, 1, 256] - - [377, 4282.36] + - [472, 4282.36] - - [448, 4288, 1, 3328] - - [374, 8516.13] + - [469, 8516.13] - - [2368, 1408, 1, 256] - - [380, 8632.19] + - [475, 8632.19] - - [5888, 5056, 1, 128] - - [366, 5091.11] + - [461, 5091.11] - - [704, 2368, 1, 256] - - [380, 7664.8] + - [475, 7664.8] - - [2944, 448, 1, 1280] - - [380, 7618.35] + - [475, 7618.35] - - [5888, 2368, 1, 3328] - - [383, 9343.48] + - [478, 9343.48] - - [64, 2944, 1, 1280] - - [348, 5162.18] + - [443, 5162.18] - - [448, 1856, 1, 1280] - - [374, 7028.0] + - [469, 7028.0] - - [4288, 448, 1, 1280] - - [374, 5855.76] + - [469, 5855.76] - - [5888, 704, 1, 3328] - - [383, 9190.91] + - [478, 9190.91] - - [5056, 256, 1, 128] - - [369, 3235.94] + - [464, 3235.94] - - [1856, 256, 1, 128] - - [367, 1849.78] + - [462, 1849.78] - - [5056, 128, 1, 256] - - [380, 6109.06] + - [475, 6109.06] - - [704, 4, 1, 256] - - [340, 125.256] + - [435, 125.256] - - [1408, 5888, 1, 128] - - [366, 5055.16] + - [461, 5055.16] - - [4288, 4, 1, 128] - - [386, 95.7209] + - [481, 95.7209] - - [1408, 1024, 1, 256] - - [374, 7370.28] + - [469, 7370.28] - - [1024, 1856, 1, 128] - - [366, 2966.8] + - [461, 2966.8] - - [256, 704, 1, 128] - - [368, 528.229] + - [463, 528.229] - - [256, 1024, 1, 128] - - [366, 1171.69] + - [461, 1171.69] - - [448, 1024, 1, 256] - - [380, 5624.65] + - [475, 5624.65] - - [128, 4, 1, 3328] - - [397, 191.985] + - [492, 191.985] - - [5056, 6784, 1, 1280] - - [374, 9544.07] + - [469, 9544.07] - - [704, 5056, 1, 3328] - - [381, 8790.35] + - [476, 8790.35] - - [64, 1408, 1, 1280] - - [348, 4505.7] + - [443, 4505.7] - - [3584, 5056, 1, 3328] - - [379, 9073.52] + - [474, 9073.52] - - [1856, 4, 1, 3328] - - [397, 612.875] + - [492, 612.875] - - [4, 2944, 1, 128] - - [386, 72.0145] + - [481, 72.0145] - - [2368, 2944, 1, 3328] - - [372, 9314.68] + - [467, 9314.68] - - [448, 448, 1, 1280] - - [356, 5129.91] + - [451, 5129.91] - - [2368, 3584, 1, 256] - - [374, 8998.8] + - [469, 8998.8] - - [5056, 3584, 1, 1280] - - [375, 9345.17] + - [470, 9345.17] - - [448, 4, 1, 3328] - - [397, 487.337] + - [492, 487.337] - - [1856, 2944, 1, 1280] - - [385, 8438.79] + - [480, 8438.79] - - [3584, 2368, 1, 1280] - - [380, 9298.9] + - [475, 9298.9] - - [128, 1024, 1, 256] - - [332, 2356.45] + - [427, 2356.45] - - [2944, 1408, 1, 256] - - [372, 5440.82] + - [467, 5440.82] - - [4288, 1408, 1, 3328] - - [372, 9386.09] + - [467, 9386.09] - - [3584, 64, 1, 3328] - - [328, 6310.97] + - [423, 6310.97] - - [1408, 128, 1, 256] - - [374, 2942.53] + - [469, 2942.53] - - [2944, 1024, 1, 128] - - [369, 3927.99] + - [464, 3927.99] - - [4288, 5056, 1, 1280] - - [376, 8328.58] + - [471, 8328.58] - - [5888, 6784, 1, 1280] - - [385, 9757.44] + - [480, 9757.44] - - [6784, 5056, 1, 128] - - [365, 5101.4] + - [460, 5101.4] - - [256, 1024, 1, 3328] - - [374, 6475.87] + - [469, 6475.87] - - [3584, 4, 1, 256] - - [390, 420.973] + - [485, 420.973] - - [1856, 64, 1, 3328] - - [356, 6409.2] + - [451, 6409.2] - - [64, 6784, 1, 128] - - [304, 2387.32] + - [399, 2387.32] - - [5888, 1408, 1, 3328] - - [379, 9655.89] + - [474, 9655.89] - - [5888, 64, 1, 1280] - - [374, 5870.86] + - [469, 5870.86] - - [256, 5056, 1, 256] - - [377, 6109.06] + - [472, 6109.06] - - [128, 3584, 1, 128] - - [307, 2383.23] + - [402, 2383.23] - - [448, 3584, 1, 3328] - - [372, 7092.28] + - [467, 7092.28] - - [704, 2368, 1, 128] - - [366, 3741.08] + - [461, 3741.08] - - [5888, 256, 1, 128] - - [367, 2977.54] + - [462, 2977.54] - - [4, 5056, 1, 128] - - [386, 132.72] + - [481, 132.72] - - [448, 256, 1, 256] - - [338, 2308.29] + - [433, 2308.29] - - [704, 4, 1, 3328] - - [343, 552.674] + - [438, 552.674] - - [1408, 256, 1, 256] - - [374, 4577.22] + - [469, 4577.22] - - [3584, 1856, 1, 128] - - [366, 4571.86] + - [461, 4571.86] - - [4288, 4288, 1, 128] - - [369, 5284.65] + - [464, 5284.65] - - [1856, 1024, 1, 3328] - - [380, 6362.25] + - [475, 6362.25] - - [128, 5888, 1, 3328] - - [350, 7040.83] + - [445, 7040.83] - - [1024, 5056, 1, 256] - - [385, 7855.7] + - [480, 7855.7] - - [2368, 1408, 1, 3328] - - [380, 9205.66] + - [475, 9205.66] - - [5888, 448, 1, 256] - - [377, 5538.84] + - [472, 5538.84] - - [5888, 6784, 1, 128] - - [365, 4500.85] + - [460, 4500.85] - - [2368, 4, 1, 3328] - - [343, 642.898] + - [438, 642.898] - - [6784, 5056, 1, 1280] - - [381, 9249.23] + - [476, 9249.23] - - [5056, 704, 1, 1280] - - [380, 8883.37] + - [475, 8883.37] - - [1408, 256, 1, 1280] - - [374, 5632.1] + - [469, 5632.1] - - [4288, 6784, 1, 1280] - - [380, 8843.31] + - [475, 8843.31] - - [128, 704, 1, 256] - - [338, 2045.19] + - [433, 2045.19] - - [448, 128, 1, 1280] - - [348, 3807.17] + - [443, 3807.17] - - [6784, 4, 1, 3328] - - [391, 684.671] + - [486, 684.671] - - [4288, 4, 1, 1280] - - [340, 601.925] + - [435, 601.925] - - [1024, 64, 1, 3328] - - [352, 3928.48] + - [447, 3928.48] - - [1856, 4, 1, 256] - - [390, 293.394] + - [485, 293.394] - - [64, 3584, 1, 1280] - - [374, 5265.55] + - [469, 5265.55] - - [6784, 1408, 1, 256] - - [374, 9059.36] + - [469, 9059.36] - - [3584, 5888, 1, 128] - - [366, 5084.29] + - [461, 5084.29] - - [5056, 5888, 1, 256] - - [385, 8590.09] + - [480, 8590.09] - - [2368, 1024, 1, 256] - - [377, 4493.13] + - [472, 4493.13] - - [2944, 1856, 1, 256] - - [383, 5202.41] + - [478, 5202.41] - - [1856, 6784, 1, 1280] - - [381, 9071.48] + - [476, 9071.48] - - [64, 5056, 1, 128] - - [304, 2038.42] + - [399, 2038.42] - - [5888, 64, 1, 128] - - [303, 2016.59] + - [398, 2016.59] - - [448, 704, 1, 128] - - [367, 1173.65] + - [462, 1173.65] - - [4, 1024, 1, 128] - - [386, 8.89685] + - [481, 8.89685] - - [4288, 3584, 1, 256] - - [380, 9080.26] + - [475, 9080.26] - - [1408, 704, 1, 128] - - [366, 3165.71] + - [461, 3165.71] - - [64, 256, 1, 3328] - - [360, 3126.59] + - [455, 3126.59] - - [5056, 1856, 1, 1280] - - [377, 8857.55] + - [472, 8857.55] - - [1408, 1024, 1, 3328] - - [383, 8177.12] + - [478, 8177.12] - - [2368, 256, 1, 3328] - - [374, 6810.31] + - [469, 6810.31] - - [5888, 3584, 1, 1280] - - [372, 9535.55] + - [467, 9535.55] - - [1856, 3584, 1, 3328] - - [374, 9281.91] + - [469, 9281.91] - - [5888, 128, 1, 1280] - - [380, 8136.82] + - [475, 8136.82] - - [1024, 2944, 1, 256] - - [372, 7247.96] + - [467, 7247.96] - - [448, 6784, 1, 1280] - - [380, 7014.04] + - [475, 7014.04] - - [256, 3584, 1, 1280] - - [374, 7738.64] + - [469, 7738.64] - - [448, 128, 1, 128] - - [304, 496.048] + - [399, 496.048] - - [704, 5056, 1, 256] - - [380, 8609.44] + - [475, 8609.44] - - [3584, 1024, 1, 3328] - - [373, 7765.73] + - [468, 7765.73] - - [2944, 1856, 1, 1280] - - [385, 7776.03] + - [480, 7776.03] - - [128, 256, 1, 128] - - [317, 296.308] + - [412, 296.308] - - [5056, 256, 1, 256] - - [374, 7829.73] + - [469, 7829.73] - - [2368, 3584, 1, 3328] - - [373, 8896.08] + - [468, 8896.08] - - [2944, 704, 1, 1280] - - [383, 6855.83] + - [478, 6855.83] - - [128, 4, 1, 256] - - [392, 24.9242] + - [487, 24.9242] - - [2944, 3584, 1, 1280] - - [385, 9049.22] + - [480, 9049.22] - - [1856, 5888, 1, 1280] - - [380, 9432.06] + - [475, 9432.06] - - [256, 256, 1, 1280] - - [345, 3942.12] + - [440, 3942.12] - - [5056, 448, 1, 3328] - - [385, 4587.83] + - [480, 4587.83] - - [4288, 1408, 1, 256] - - [385, 5408.83] + - [480, 5408.83] - - [3584, 64, 1, 256] - - [354, 2496.71] + - [449, 2496.71] - - [64, 1856, 1, 3328] - - [327, 5896.78] + - [422, 5896.78] - - [256, 1408, 1, 128] - - [366, 1643.17] + - [461, 1643.17] - - [5888, 1408, 1, 128] - - [365, 4436.37] + - [460, 4436.37] - - [4288, 2368, 1, 1280] - - [374, 9433.04] + - [469, 9433.04] - - [4, 4288, 1, 256] - - [389, 442.732] + - [484, 442.732] - - [256, 4288, 1, 128] - - [366, 2814.79] + - [461, 2814.79] - - [256, 128, 1, 3328] - - [355, 3951.26] + - [450, 3951.26] - - [6784, 2368, 1, 256] - - [374, 9169.99] + - [469, 9169.99] - - [5888, 128, 1, 128] - - [303, 3156.81] + - [398, 3156.81] - - [4288, 1856, 1, 256] - - [380, 5658.23] + - [475, 5658.23] - - [1856, 256, 1, 3328] - - [374, 7646.37] + - [469, 7646.37] - - [1856, 2944, 1, 256] - - [381, 6444.98] + - [476, 6444.98] - - [5056, 1024, 1, 128] - - [365, 4607.3] + - [460, 4607.3] - - [64, 5888, 1, 1280] - - [380, 5842.46] + - [475, 5842.46] - - [1760, 7133, 1, 1760] - - [373, 9097.84] + - [468, 9097.84] - - [6784, 256, 1, 128] - - [366, 3685.41] + - [461, 3685.41] - - [5888, 704, 1, 128] - - [365, 3656.23] + - [460, 3656.23] - - [6784, 64, 1, 128] - - [316, 2191.52] + - [411, 2191.52] - - [1024, 4288, 1, 1280] - - [380, 9199.32] + - [475, 9199.32] - - [2368, 5056, 1, 3328] - - [376, 9072.88] + - [471, 9072.88] - - [448, 4, 1, 128] - - [387, 5.42937] + - [482, 5.42937] - - [4, 256, 1, 3328] - - [397, 311.037] + - [492, 311.037] - - [4288, 1024, 1, 3328] - - [378, 8660.33] + - [473, 8660.33] - - [1024, 5056, 1, 3328] - - [374, 8886.76] + - [469, 8886.76] - - [1024, 1856, 1, 3328] - - [379, 8426.24] + - [474, 8426.24] - - [704, 704, 1, 1280] - - [374, 7661.8] + - [469, 7661.8] - - [128, 2368, 1, 1280] - - [348, 5746.15] + - [443, 5746.15] - - [1408, 128, 1, 3328] - - [356, 6530.87] + - [451, 6530.87] - - [3584, 256, 1, 1280] - - [380, 7634.04] + - [475, 7634.04] - - [4, 128, 1, 128] - - [387, 2.07874] + - [482, 2.07874] - - [704, 6784, 1, 128] - - [369, 4589.59] + - [464, 4589.59] - - [3584, 128, 1, 1280] - - [374, 7078.24] + - [469, 7078.24] - - [4, 256, 1, 1280] - - [343, 178.187] + - [438, 178.187] - - [128, 704, 1, 3328] - - [348, 5959.81] + - [443, 5959.81] - - [4288, 6784, 1, 256] - - [374, 9326.54] + - [469, 9326.54] - - [3584, 2944, 1, 3328] - - [376, 9114.16] + - [471, 9114.16] - - [128, 1856, 1, 256] - - [380, 3672.65] + - [475, 3672.65] - - [64, 4288, 1, 256] - - [374, 3457.51] + - [469, 3457.51] - - [4, 3584, 1, 3328] - - [323, 694.37] + - [418, 694.37] - - [64, 4, 1, 3328] - - [343, 71.5738] + - [438, 71.5738] - - [4, 64, 1, 3328] - - [343, 91.9069] + - [438, 91.9069] - - [5888, 2944, 1, 256] - - [373, 7241.55] + - [468, 7241.55] - - [2368, 6784, 1, 128] - - [369, 5229.63] + - [464, 5229.63] - - [448, 4288, 1, 1280] - - [374, 8416.4] + - [469, 8416.4] - - [448, 1856, 1, 3328] - - [374, 7161.56] + - [469, 7161.56] - - [4, 1024, 1, 256] - - [340, 187.346] + - [435, 187.346] - - [5056, 4288, 1, 256] - - [385, 8947.26] + - [480, 8947.26] - - [1024, 448, 1, 256] - - [380, 5318.96] + - [475, 5318.96] - - [1024, 3584, 1, 256] - - [375, 6152.04] + - [470, 6152.04] - - [2944, 128, 1, 1280] - - [356, 6053.63] + - [451, 6053.63] - - [1856, 5056, 1, 128] - - [366, 5091.42] + - [461, 5091.42] - - [64, 256, 1, 256] - - [329, 771.112] + - [424, 771.112] - - [1408, 4, 1, 128] - - [386, 40.8758] + - [481, 40.8758] - - [128, 2368, 1, 128] - - [314, 1520.37] + - [409, 1520.37] - - [256, 704, 1, 1280] - - [374, 4329.81] + - [469, 4329.81] - - [64, 2368, 1, 128] - - [305, 1212.52] + - [400, 1212.52] - - [6784, 6784, 1, 3328] - - [385, 8310.67] + - [480, 8310.67] - - [448, 5888, 1, 1280] - - [380, 8502.33] + - [475, 8502.33] - - [5056, 448, 1, 128] - - [366, 4161.0] + - [461, 4161.0] - - [3584, 2944, 1, 128] - - [366, 4363.51] + - [461, 4363.51] - - [6784, 256, 1, 1280] - - [380, 8629.67] + - [475, 8629.67] - - [256, 2944, 1, 1280] - - [380, 7277.48] + - [475, 7277.48] - - [64, 4288, 1, 128] - - [305, 1822.06] + - [400, 1822.06] - - [2368, 5888, 1, 3328] - - [374, 9017.52] + - [469, 9017.52] - - [4, 64, 1, 256] - - [340, 16.1627] + - [435, 16.1627] - - [704, 1024, 1, 3328] - - [380, 8059.55] + - [475, 8059.55] - - [2368, 1856, 1, 1280] - - [380, 8813.24] + - [475, 8813.24] - - [128, 448, 1, 128] - - [301, 588.244] + - [396, 588.244] - - [128, 6784, 1, 256] - - [380, 6538.28] + - [475, 6538.28] - - [3584, 4288, 1, 128] - - [366, 5025.46] + - [461, 5025.46] - - [64, 448, 1, 128] - - [318, 231.793] + - [413, 231.793] - - [5888, 4288, 1, 3328] - - [374, 9515.88] + - [469, 9515.88] - - [2368, 704, 1, 256] - - [380, 7642.84] + - [475, 7642.84] - - [256, 1856, 1, 3328] - - [380, 6547.17] + - [475, 6547.17] - - [1856, 128, 1, 256] - - [374, 3782.28] + - [469, 3782.28] - - [6784, 128, 1, 128] - - [308, 2835.54] + - [403, 2835.54] - - [3584, 1408, 1, 128] - - [365, 3049.21] + - [460, 3049.21] - - [1856, 5056, 1, 1280] - - [381, 8863.3] + - [476, 8863.3] - - [2944, 1024, 1, 1280] - - [385, 8873.25] + - [480, 8873.25] - - [5056, 4, 1, 256] - - [321, 494.121] + - [416, 494.121] - - [3584, 5888, 1, 3328] - - [373, 9585.25] + - [468, 9585.25] - - [2368, 4288, 1, 256] - - [385, 6419.05] + - [480, 6419.05] - - [1024, 2368, 1, 3328] - - [380, 8645.36] + - [475, 8645.36] - - [64, 704, 1, 3328] - - [362, 4399.93] + - [457, 4399.93] - - [704, 1408, 1, 256] - - [374, 7428.54] + - [469, 7428.54] - - [6784, 1856, 1, 3328] - - [385, 9163.66] + - [480, 9163.66] - - [1024, 2944, 1, 128] - - [369, 3551.98] + - [464, 3551.98] - - [1024, 3584, 1, 1280] - - [383, 9112.47] + - [478, 9112.47] - - [4288, 5888, 1, 3328] - - [373, 8524.05] + - [468, 8524.05] - - [4288, 4, 1, 3328] - - [340, 620.016] + - [435, 620.016] - - [256, 1408, 1, 256] - - [374, 4505.7] + - [469, 4505.7] - - [448, 2944, 1, 1280] - - [374, 7612.87] + - [469, 7612.87] - - [4, 5888, 1, 128] - - [386, 174.564] + - [481, 174.564] - - [1024, 2944, 1, 3328] - - [379, 9136.74] + - [474, 9136.74] - - [3584, 6784, 1, 256] - - [379, 7253.89] + - [474, 7253.89] - - [256, 6784, 1, 1280] - - [374, 8637.72] + - [469, 8637.72] - - [1856, 3584, 1, 256] - - [380, 8199.67] + - [475, 8199.67] - - [128, 448, 1, 3328] - - [361, 4799.92] + - [456, 4799.92] - - [6784, 1856, 1, 128] - - [366, 5185.62] + - [461, 5185.62] - - [4, 448, 1, 256] - - [340, 86.9848] + - [435, 86.9848] - - [2944, 704, 1, 128] - - [369, 3798.64] + - [464, 3798.64] - - [256, 5888, 1, 1280] - - [374, 8678.47] + - [469, 8678.47] - - [4, 128, 1, 1280] - - [343, 102.5] + - [438, 102.5] - - [4288, 6784, 1, 3328] - - [379, 8209.4] + - [474, 8209.4] - - [6784, 128, 1, 1280] - - [356, 6562.99] + - [451, 6562.99] - - [64, 1408, 1, 256] - - [346, 2059.8] + - [441, 2059.8] - - [7680, 5481, 1, 2560] - - [385, 9426.79] + - [480, 9426.79] - - [2368, 1408, 1, 128] - - [366, 4532.5] + - [461, 4532.5] - - [1856, 448, 1, 256] - - [374, 6275.48] + - [469, 6275.48] - - [1408, 1024, 1, 128] - - [366, 3604.58] + - [461, 3604.58] - - [128, 64, 1, 128] - - [301, 87.4813] + - [396, 87.4813] - - [6784, 3584, 1, 3328] - - [381, 8991.92] + - [476, 8991.92] - - [2944, 64, 1, 3328] - - [350, 6043.36] + - [445, 6043.36] - - [64, 64, 1, 128] - - [306, 36.309] + - [401, 36.309] - - [2368, 5056, 1, 1280] - - [380, 9438.48] + - [475, 9438.48] - - [64, 4, 1, 1280] - - [343, 40.2569] + - [438, 40.2569] - - [1408, 2368, 1, 1280] - - [376, 7738.16] + - [471, 7738.16] - - [128, 1408, 1, 1280] - - [348, 4937.74] + - [443, 4937.74] - - [256, 64, 1, 3328] - - [358, 2683.46] + - [453, 2683.46] - - [2944, 4288, 1, 128] - - [366, 5173.81] + - [461, 5173.81] - - [2944, 2944, 1, 256] - - [374, 8943.92] + - [469, 8943.92] - - [2944, 4, 1, 1280] - - [323, 617.857] + - [418, 617.857] - - [5888, 4, 1, 256] - - [389, 483.218] + - [484, 483.218] - - [6784, 256, 1, 256] - - [380, 7916.7] + - [475, 7916.7] - - [256, 5056, 1, 3328] - - [374, 8953.25] + - [469, 8953.25] - - [128, 4288, 1, 1280] - - [327, 6015.05] + - [422, 6015.05] - - [5056, 1856, 1, 128] - - [368, 4221.15] + - [463, 4221.15] - - [5888, 1408, 1, 256] - - [379, 9144.85] + - [474, 9144.85] - - [128, 128, 1, 256] - - [329, 759.938] + - [424, 759.938] - - [5056, 4, 1, 3328] - - [389, 642.818] + - [484, 642.818] - - [4288, 3584, 1, 3328] - - [375, 9300.05] + - [470, 9300.05] - - [448, 704, 1, 3328] - - [381, 4481.08] + - [476, 4481.08] - - [448, 448, 1, 128] - - [305, 1360.81] + - [400, 1360.81] - - [1024, 2368, 1, 1280] - - [374, 8570.29] + - [469, 8570.29] - - [1856, 704, 1, 3328] - - [374, 8448.26] + - [469, 8448.26] - - [4, 2368, 1, 128] - - [386, 64.5902] + - [481, 64.5902] - - [5888, 6784, 1, 3328] - - [381, 9447.12] + - [476, 9447.12] - - [704, 4288, 1, 1280] - - [383, 7476.87] + - [478, 7476.87] - - [704, 256, 1, 256] - - [374, 2957.62] + - [469, 2957.62] - - [6784, 448, 1, 3328] - - [377, 8886.22] + - [472, 8886.22] - - [4288, 1024, 1, 128] - - [365, 3864.49] + - [460, 3864.49] - - [49, 512, 128, 2048] - - [408, 7112.78] + - [503, 7112.78] - - [196, 256, 256, 1024] - - [402, 8302.7] + - [497, 8302.7] - - [784, 512, 256, 128] - - [400, 9061.36] + - [495, 9061.36] - - [49, 2048, 128, 512] - - [398, 6963.36] - - - [784, 512, 64, 128] - - [400, 8822.62] + - [493, 6963.36] - - [784, 128, 128, 512] - - [407, 8983.63] + - [502, 8983.63] - - [196, 256, 64, 1024] - - [406, 7823.5] + - [501, 7823.5] - - [3136, 256, 256, 64] - - [403, 9051.38] + - [498, 9051.38] - - [3136, 64, 128, 64] - - [399, 8581.35] + - [494, 8581.35] - - [49, 2048, 256, 512] - - [398, 7049.64] - - - [196, 1024, 64, 256] - - [401, 7953.69] + - [493, 7049.64] - - [784, 128, 256, 512] - - [409, 9102.99] + - [504, 9102.99] - - [196, 256, 128, 1024] - - [401, 8085.89] - - - [3136, 64, 64, 256] - - [405, 9266.13] - - - [784, 128, 64, 512] - - [406, 8809.39] - - - [49, 2048, 64, 512] - - [398, 6843.95] + - [496, 8085.89] - - [3136, 64, 128, 256] - - [405, 9381.39] + - [500, 9381.39] - - [3136, 256, 128, 64] - - [403, 8982.64] + - [498, 8982.64] - - [784, 512, 128, 128] - - [400, 8965.99] - - - [3136, 256, 64, 64] - - [403, 8879.8] + - [495, 8965.99] - - [3136, 64, 256, 256] - - [405, 9566.43] - - - [3136, 64, 64, 64] - - [404, 8314.05] + - [500, 9566.43] - - [3136, 64, 256, 64] - - [399, 8743.8] + - [494, 8743.8] - - [196, 1024, 128, 256] - - [402, 8119.43] - - - [49, 512, 64, 2048] - - [410, 7055.41] + - [497, 8119.43] - - [49, 512, 256, 2048] - - [411, 7166.41] + - [506, 7166.41] - - [196, 1024, 256, 256] - - [402, 8210.66] + - [497, 8210.66] - - [5329, 160, 64, 64] - - [418, 8156.89] + - [513, 8156.89] - - [1225, 288, 64, 48] - - [422, 6926.23] + - [517, 6926.23] - - [1225, 192, 64, 64] - - [424, 7840.1] + - [519, 7840.1] - - [64, 1280, 64, 384] - - [425, 9276.11] + - [520, 9276.11] - - [1225, 384, 64, 192] - - [415, 9162.35] + - [510, 9162.35] - - [1225, 288, 64, 64] - - [416, 7495.27] + - [511, 7495.27] - - [5329, 64, 64, 80] - - [417, 8480.13] + - [512, 8480.13] - - [289, 1024, 64, 256] - - [415, 8483.83] + - [510, 8483.83] - - [289, 768, 64, 192] - - [421, 8234.84] + - [516, 8234.84] - - [289, 768, 64, 128] - - [421, 7988.81] + - [516, 7988.81] - - [64, 1536, 64, 384] - - [425, 9323.65] + - [520, 9323.65] - - [1225, 384, 64, 64] - - [424, 8158.8] + - [519, 8158.8] - - [64, 2048, 64, 192] - - [421, 8818.61] + - [516, 8818.61] - - [64, 1280, 64, 320] - - [417, 9202.17] + - [512, 9202.17] - - [1225, 384, 64, 96] - - [415, 8540.7] + - [510, 8540.7] - - [64, 1280, 64, 448] - - [421, 9317.82] + - [516, 9317.82] - - [289, 768, 64, 160] - - [425, 8128.81] + - [520, 8128.81] - - [1225, 192, 64, 32] - - [424, 6495.37] + - [519, 6495.37] - - [64, 1536, 64, 256] - - [421, 9143.0] + - [516, 9143.0] - - [1225, 256, 64, 48] - - [419, 7545.36] + - [514, 7545.36] - - [1225, 256, 64, 64] - - [420, 7972.45] + - [515, 7972.45] - - [1225, 192, 64, 48] - - [423, 7348.9] + - [518, 7348.9] - - [289, 1024, 64, 384] - - [413, 8725.66] + - [508, 8725.66] - - [289, 1024, 64, 192] - - [415, 8313.16] + - [510, 8313.16] - - [64, 1280, 64, 192] - - [417, 8768.68] + - [512, 8768.68] - - [64, 2048, 64, 320] - - [414, 9147.98] + - [509, 9147.98] - - [64, 2048, 64, 448] - - [412, 9304.16] + - [507, 9304.16] - - [64, 2048, 64, 384] - - [414, 9235.28] + - [509, 9235.28] - - [289, 1024, 64, 128] - - [421, 7989.51] + - [516, 7989.51] - - [4096, 1024, 1, 2984] - - [460, 9846.39] + - [555, 9846.39] - - [1024, 4096, 1, 3437] - - [461, 9915.8] + - [556, 9915.8] - - [1024, 4096, 1, 3235] - - [454, 9914.02] + - [549, 9914.02] - - [4096, 1024, 1, 4032] - - [460, 9926.06] + - [555, 9926.06] - - [1024, 4096, 1, 3334] - - [461, 9918.27] + - [556, 9918.27] - - [4096, 1024, 1, 3288] - - [461, 9854.67] + - [556, 9854.67] - - [1024, 4096, 1, 3515] - - [461, 9924.03] + - [556, 9924.03] - - [4096, 1024, 1, 3437] - - [461, 9869.63] + - [556, 9869.63] - - [1024, 4096, 1, 3259] - - [461, 9907.65] + - [556, 9907.65] - - [1024, 4096, 1, 3384] - - [453, 9921.21] + - [548, 9921.21] - - [64, 92, 688, 92] - - [431, 6137.89] + - [526, 6137.89] - - [4096, 1024, 1, 3458] - - [460, 9887.69] + - [555, 9887.69] - - [1024, 4096, 1, 3412] - - [460, 9930.56] + - [555, 9930.56] - - [1024, 4096, 1, 3529] - - [454, 9924.54] + - [549, 9924.54] - - [1024, 4096, 1, 4032] - - [461, 9963.48] + - [556, 9963.48] - - [4096, 1024, 1, 3999] - - [461, 9895.0] + - [556, 9895.0] - - [1024, 4096, 1, 3079] - - [454, 9894.58] + - [549, 9894.58] - - [1024, 4096, 1, 3876] - - [453, 9949.39] + - [548, 9949.39] - - [1024, 4096, 1, 3450] - - [461, 9915.65] + - [556, 9915.65] - - [1024, 4096, 1, 3256] - - [461, 9911.18] + - [556, 9911.18] - - [4096, 1024, 1, 3403] - - [460, 9858.93] + - [555, 9858.93] - - [1024, 1024, 1, 3975] - - [451, 8990.81] + - [546, 8990.81] - - [1024, 4096, 1, 3359] - - [461, 9915.0] + - [556, 9915.0] - - [4096, 1024, 1, 3549] - - [460, 9870.66] + - [555, 9870.66] - - [4096, 1024, 1, 3176] - - [461, 9855.92] + - [556, 9855.92] - - [1024, 4096, 1, 3504] - - [453, 9934.17] + - [548, 9934.17] - - [4096, 1024, 1, 3314] - - [460, 9873.9] + - [555, 9873.9] - - [4096, 1024, 1, 3183] - - [460, 9843.84] + - [555, 9843.84] - - [1024, 4096, 1, 3209] - - [454, 9904.97] + - [549, 9904.97] - - [1024, 4096, 1, 3720] - - [453, 9934.16] + - [548, 9934.16] - - [1024, 4096, 1, 3859] - - [453, 9952.53] + - [548, 9952.53] - - [1024, 33708, 1, 4059] - - [453, 10321.5] + - [548, 10321.5] - - [1024, 4096, 1, 3968] - - [453, 9955.96] + - [548, 9955.96] - - [64, 123, 528, 123] - - [426, 6916.21] + - [521, 6916.21] - - [4096, 1024, 1, 3477] - - [461, 9872.03] + - [556, 9872.03] - - [4096, 1024, 1, 3233] - - [461, 9862.35] + - [556, 9862.35] - - [4096, 1024, 1, 3409] - - [461, 9876.86] + - [556, 9876.86] - - [4096, 1024, 1, 3564] - - [461, 9870.49] + - [556, 9870.49] - - [64, 102, 624, 100] - - [426, 5773.16] + - [521, 5773.16] - - [4096, 1024, 1, 3190] - - [460, 9850.97] + - [555, 9850.97] - - [64, 112, 576, 111] - - [426, 6517.35] + - [521, 6517.35] - - [1024, 4096, 1, 3288] - - [460, 9911.9] + - [555, 9911.9] - - [4096, 1024, 1, 3451] - - [460, 9859.61] + - [555, 9859.61] - - [1024, 4096, 1, 3348] - - [453, 9915.47] + - [548, 9915.47] - - [64, 102, 624, 102] - - [426, 5783.7] + - [521, 5783.7] - - [1024, 4096, 1, 3465] - - [454, 9913.12] + - [549, 9913.12] - - [1024, 33708, 1, 4032] - - [453, 10340.4] + - [548, 10340.4] - - [1024, 33708, 1, 3840] - - [453, 10341.8] + - [548, 10341.8] - - [4096, 1024, 1, 3391] - - [461, 9861.77] + - [556, 9861.77] - - [1024, 4096, 1, 3530] - - [453, 9920.44] + - [548, 9920.44] - - [4096, 1024, 1, 3209] - - [460, 9847.0] + - [555, 9847.0] - - [1024, 4096, 1, 3457] - - [454, 9917.29] + - [549, 9917.29] - - [1024, 4096, 1, 3386] - - [453, 9917.65] + - [548, 9917.65] - - [4096, 1024, 1, 3350] - - [460, 9884.54] + - [555, 9884.54] - - [1024, 4096, 1, 3184] - - [461, 9925.98] + - [556, 9925.98] - - [1024, 4096, 1, 3093] - - [460, 9902.55] + - [555, 9902.55] - - [64, 133, 480, 135] - - [443, 6205.97] + - [538, 6205.97] - - [1024, 4096, 1, 3400] - - [453, 9917.1] + - [548, 9917.1] - - [1024, 1024, 1, 4026] - - [459, 9014.39] + - [554, 9014.39] - - [1024, 4096, 1, 3214] - - [453, 9895.94] + - [548, 9895.94] - - [4096, 1024, 1, 3406] - - [461, 9857.82] + - [556, 9857.82] - - [1024, 4096, 1, 3565] - - [460, 9919.37] + - [555, 9919.37] - - [4096, 1024, 1, 3536] - - [461, 9889.06] + - [556, 9889.06] - - [1024, 4096, 1, 3183] - - [460, 9907.55] + - [555, 9907.55] - - [1024, 4096, 1, 3462] - - [461, 9922.4] + - [556, 9922.4] - - [4096, 1024, 1, 3130] - - [454, 9846.04] + - [549, 9846.04] - - [4096, 1024, 1, 3381] - - [461, 9868.27] + - [556, 9868.27] - - [4096, 1024, 1, 3298] - - [460, 9870.54] + - [555, 9870.54] - - [1024, 4096, 1, 3292] - - [453, 9906.3] + - [548, 9906.3] - - [4096, 1024, 1, 3289] - - [460, 9856.55] + - [555, 9856.55] - - [64, 160, 400, 159] - - [446, 7427.84] + - [541, 7427.84] - - [1024, 4096, 1, 3379] - - [453, 9917.09] + - [548, 9917.09] - - [1024, 4096, 1, 3990] - - [454, 9947.37] + - [549, 9947.37] - - [1024, 4096, 1, 3540] - - [461, 9935.76] + - [556, 9935.76] - - [4096, 1024, 1, 3412] - - [461, 9867.56] + - [556, 9867.56] - - [1024, 1024, 1, 3780] - - [456, 9036.26] + - [551, 9036.26] - - [1024, 4096, 1, 3555] - - [460, 9927.37] + - [555, 9927.37] - - [1024, 4096, 1, 3518] - - [454, 9925.55] + - [549, 9925.55] - - [4096, 1024, 1, 3189] - - [460, 9861.24] + - [555, 9861.24] - - [1024, 4096, 1, 3298] - - [454, 9923.22] + - [549, 9923.22] - - [4096, 1024, 1, 3072] - - [460, 9872.08] + - [555, 9872.08] - - [1024, 4096, 1, 3393] - - [461, 9929.28] + - [556, 9929.28] - - [1024, 4096, 1, 3207] - - [453, 9912.81] + - [548, 9912.81] - - [64, 228, 272, 232] - - [449, 7350.14] + - [544, 7350.14] - - [64, 23, 2720, 23] - - [430, 2640.25] + - [525, 2640.25] - - [4096, 1024, 1, 3487] - - [461, 9860.91] + - [556, 9860.91] - - [1024, 1024, 1, 3822] - - [459, 8993.96] + - [554, 8993.96] - - [64, 77, 816, 77] - - [431, 5273.19] + - [526, 5273.19] - - [4096, 1024, 1, 3431] - - [461, 9867.53] + - [556, 9867.53] - - [4096, 1024, 1, 3378] - - [460, 9888.14] + - [555, 9888.14] - - [4096, 1024, 1, 3529] - - [454, 9879.5] + - [549, 9879.5] - - [4096, 1024, 1, 3460] - - [461, 9877.25] + - [556, 9877.25] - - [1024, 4096, 1, 3336] - - [453, 9912.41] + - [548, 9912.41] - - [1024, 4096, 1, 3501] - - [454, 9914.4] + - [549, 9914.4] - - [64, 159, 400, 159] - - [444, 7016.51] + - [539, 7016.51] - - [1024, 4096, 1, 3584] - - [461, 9940.59] + - [556, 9940.59] - - [64, 135, 480, 134] - - [444, 6241.39] + - [539, 6241.39] - - [64, 99, 624, 99] - - [435, 5617.39] + - [530, 5617.39] - - [4096, 1024, 1, 2499] - - [460, 9813.57] + - [555, 9813.57] - - [1024, 1024, 1, 3942] - - [456, 9060.01] + - [551, 9060.01] - - [4096, 1024, 1, 3352] - - [460, 9867.12] + - [555, 9867.12] - - [1024, 4096, 1, 3543] - - [461, 9928.77] + - [556, 9928.77] - - [1024, 4096, 1, 3476] - - [460, 9931.58] + - [555, 9931.58] - - [1024, 33708, 1, 3822] - - [453, 10324.7] + - [548, 10324.7] - - [1024, 4096, 1, 3436] - - [453, 9917.28] + - [548, 9917.28] - - [1024, 1024, 1, 3861] - - [452, 8998.49] + - [547, 8998.49] - - [1024, 1024, 1, 4000] - - [457, 9058.3] + - [552, 9058.3] - - [1024, 4096, 1, 3594] - - [453, 9927.88] + - [548, 9927.88] - - [4096, 1024, 1, 3514] - - [461, 9872.3] + - [556, 9872.3] - - [1024, 4096, 1, 3064] - - [460, 9907.1] + - [555, 9907.1] - - [4096, 1024, 1, 3371] - - [453, 9857.74] + - [548, 9857.74] - - [4096, 1024, 1, 3558] - - [461, 9876.31] + - [556, 9876.31] - - [4096, 1024, 1, 3517] - - [460, 9866.45] + - [555, 9866.45] - - [4096, 1024, 1, 3144] - - [460, 9846.36] + - [555, 9846.36] - - [1024, 4096, 1, 3312] - - [453, 9932.85] + - [548, 9932.85] - - [4096, 1024, 1, 3079] - - [460, 9851.1] + - [555, 9851.1] - - [1024, 4096, 1, 3415] - - [453, 9919.47] + - [548, 9919.47] - - [1024, 4096, 1, 3221] - - [460, 9908.18] + - [555, 9908.18] - - [1024, 4096, 1, 3978] - - [454, 9944.41] + - [549, 9944.41] - - [4096, 1024, 1, 3876] - - [460, 9898.99] + - [555, 9898.99] - - [1024, 4096, 1, 3528] - - [453, 9919.6] + - [548, 9919.6] - - [1024, 4096, 1, 3181] - - [461, 9894.86] + - [556, 9894.86] - - [4096, 1024, 1, 3445] - - [460, 9878.54] + - [555, 9878.54] - - [4096, 1024, 1, 3450] - - [453, 9864.82] + - [548, 9864.82] - - [4096, 1024, 1, 3377] - - [460, 9879.69] + - [555, 9879.69] - - [1024, 4096, 1, 3532] - - [454, 9928.19] + - [549, 9928.19] - - [1024, 33708, 1, 3944] - - [453, 10329.7] + - [548, 10329.7] - - [4096, 1024, 1, 3483] - - [460, 9861.83] + - [555, 9861.83] - - [1024, 4096, 1, 3358] - - [453, 9903.69] + - [548, 9903.69] - - [4096, 1024, 1, 3464] - - [460, 9876.84] + - [555, 9876.84] - - [4096, 1024, 1, 3282] - - [453, 9859.23] + - [548, 9859.23] - - [4096, 1024, 1, 3256] - - [461, 9855.1] + - [556, 9855.1] - - [1024, 4096, 1, 3057] - - [460, 9910.75] + - [555, 9910.75] - - [4096, 1024, 1, 3481] - - [460, 9866.29] + - [555, 9866.29] - - [4096, 1024, 1, 3340] - - [460, 9862.25] + - [555, 9862.25] - - [1024, 1024, 1, 3870] - - [459, 9082.45] + - [554, 9082.45] - - [1024, 4096, 1, 3273] - - [453, 9916.29] + - [548, 9916.29] - - [64, 65, 992, 65] - - [444, 4683.01] + - [539, 4683.01] - - [4096, 1024, 1, 3392] - - [454, 9881.12] + - [549, 9881.12] - - [4096, 1024, 1, 3337] - - [460, 9864.5] + - [555, 9864.5] - - [4096, 1024, 1, 3359] - - [460, 9874.42] + - [555, 9874.42] - - [4096, 1024, 1, 3498] - - [461, 9864.35] + - [556, 9864.35] - - [4096, 1024, 1, 3169] - - [460, 9851.1] + - [555, 9851.1] - - [1024, 33708, 1, 3859] - - [454, 10332.6] + - [549, 10332.6] - - [64, 19, 3264, 19] - - [430, 2182.14] + - [525, 2182.14] - - [1024, 4096, 1, 3103] - - [453, 9898.9] + - [548, 9898.9] - - [4096, 1024, 1, 3900] - - [460, 9897.12] + - [555, 9897.12] - - [1024, 4096, 1, 3442] - - [460, 9938.97] + - [555, 9938.97] - - [1024, 4096, 1, 3248] - - [460, 9939.92] + - [555, 9939.92] - - [1024, 4096, 1, 3351] - - [461, 9923.23] + - [556, 9923.23] - - [4096, 1024, 1, 3593] - - [460, 9894.36] + - [555, 9894.36] - - [1024, 4096, 1, 3780] - - [460, 9941.96] + - [555, 9941.96] - - [64, 133, 480, 133] - - [444, 6180.79] + - [539, 6180.79] - - [1024, 33708, 1, 3681] - - [453, 10332.3] + - [548, 10332.3] - - [4096, 1024, 1, 3374] - - [454, 9859.36] + - [549, 9859.36] - - [1024, 4096, 1, 3557] - - [453, 9928.2] + - [548, 9928.2] - - [4096, 1024, 1, 3906] - - [460, 9907.07] + - [555, 9907.07] - - [4096, 1024, 1, 3504] - - [460, 9886.05] + - [555, 9886.05] - - [1024, 4096, 1, 3270] - - [460, 9916.37] + - [555, 9916.37] - - [4096, 1024, 1, 3098] - - [453, 9854.76] + - [548, 9854.76] - - [64, 232, 272, 232] - - [449, 7394.1] + - [544, 7394.1] - - [4096, 1024, 1, 3216] - - [461, 9876.57] + - [556, 9876.57] - - [64, 148, 432, 148] - - [446, 6663.85] + - [541, 6663.85] - - [1024, 4096, 1, 3550] - - [460, 9920.28] + - [555, 9920.28] - - [4096, 1024, 1, 3449] - - [454, 9870.57] + - [549, 9870.57] - - [1024, 4096, 1, 3403] - - [461, 9908.21] + - [556, 9908.21] - - [1024, 4096, 1, 3523] - - [460, 9932.71] + - [555, 9932.71] - - [1024, 4096, 1, 3486] - - [460, 9917.46] + - [555, 9917.46] - - [1024, 4096, 1, 3564] - - [460, 9923.44] + - [555, 9923.44] - - [1024, 33708, 1, 4005] - - [453, 10339.5] + - [548, 10339.5] - - [4096, 1024, 1, 3296] - - [460, 9879.78] + - [555, 9879.78] - - [1024, 4096, 1, 3263] - - [453, 9907.17] + - [548, 9907.17] - - [64, 25, 2512, 25] - - [430, 2848.17] + - [525, 2848.17] - - [1024, 4096, 1, 3130] - - [461, 9900.1] + - [556, 9900.1] - - [1024, 4096, 1, 3295] - - [461, 9895.45] + - [556, 9895.45] - - [1024, 33708, 1, 3925] - - [454, 10342.3] + - [549, 10342.3] - - [1024, 4096, 1, 3378] - - [453, 9921.37] + - [548, 9921.37] - - [4096, 1024, 1, 3720] - - [461, 9885.82] + - [556, 9885.82] - - [4096, 1024, 1, 3399] - - [460, 9880.65] + - [555, 9880.65] - - [4096, 1024, 1, 3543] - - [461, 9870.73] + - [556, 9870.73] - - [64, 9, 6544, 9] - - [433, 955.17] + - [528, 955.17] - - [4096, 1024, 1, 3497] - - [460, 9868.43] + - [555, 9868.43] - - [4096, 1024, 1, 3594] - - [461, 9876.88] + - [556, 9876.88] - - [1024, 4096, 1, 3144] - - [461, 9901.96] + - [556, 9901.96] - - [1024, 4096, 1, 3975] - - [454, 9950.19] + - [549, 9950.19] - - [4096, 1024, 1, 3205] - - [461, 9856.07] + - [556, 9856.07] - - [1024, 33708, 1, 3995] - - [453, 10331.1] + - [548, 10331.1] - - [1024, 4096, 1, 3392] - - [453, 9935.78] + - [548, 9935.78] - - [1024, 4096, 1, 3055] - - [461, 9893.25] + - [556, 9893.25] - - [1024, 4096, 1, 4026] - - [461, 9940.22] + - [556, 9940.22] - - [4096, 1024, 1, 3557] - - [460, 9884.0] + - [555, 9884.0] - - [4096, 1024, 1, 3515] - - [460, 9871.94] + - [555, 9871.94] - - [4096, 1024, 1, 3486] - - [461, 9860.74] + - [556, 9860.74] - - [4096, 1024, 1, 3457] - - [461, 9885.37] + - [556, 9885.37] - - [1024, 4096, 1, 3511] - - [453, 9928.24] + - [548, 9928.24] - - [4096, 1024, 1, 3138] - - [460, 9854.06] + - [555, 9854.06] - - [1024, 4096, 1, 3339] - - [454, 9912.89] + - [549, 9912.89] - - [1024, 4096, 1, 3939] - - [454, 9952.26] + - [549, 9952.26] - - [4096, 1024, 1, 3500] - - [454, 9863.62] + - [549, 9863.62] - - [4096, 1024, 1, 3395] - - [461, 9883.82] + - [556, 9883.82] - - [4096, 1024, 1, 3968] - - [461, 9920.36] + - [556, 9920.36] - - [4096, 1024, 1, 4020] - - [461, 9912.81] + - [556, 9912.81] - - [4096, 1024, 1, 3942] - - [460, 9910.17] + - [555, 9910.17] - - [1024, 1024, 1, 4032] - - [450, 9024.74] + - [545, 9024.74] - - [4096, 1024, 1, 3349] - - [461, 9866.04] + - [556, 9866.04] - - [1024, 4096, 1, 3322] - - [454, 9908.43] + - [549, 9908.43] - - [4096, 1024, 1, 3452] - - [460, 9872.69] + - [555, 9872.69] - - [1024, 4096, 1, 3417] - - [460, 9912.64] + - [555, 9912.64] - - [1024, 1024, 1, 4012] - - [458, 9085.47] + - [553, 9085.47] - - [1024, 4096, 1, 3526] - - [454, 9920.36] + - [549, 9920.36] - - [4096, 1024, 1, 3485] - - [454, 9861.64] + - [549, 9861.64] - - [1024, 1024, 1, 3681] - - [458, 8991.46] + - [553, 8991.46] - - [4096, 1024, 1, 3303] - - [461, 9861.3] + - [556, 9861.3] - - [4096, 1024, 1, 3344] - - [461, 9892.44] + - [556, 9892.44] - - [1024, 4096, 1, 3479] - - [461, 9921.77] + - [556, 9921.77] - - [4096, 1024, 1, 3300] - - [460, 9868.64] + - [555, 9868.64] - - [1024, 4096, 1, 3439] - - [454, 9918.29] + - [549, 9918.29] - - [4096, 1024, 1, 3280] - - [461, 9875.29] + - [556, 9875.29] - - [1024, 4096, 1, 3245] - - [453, 9910.49] + - [548, 9910.49] - - [1024, 4096, 1, 3328] - - [453, 9941.6] + - [548, 9941.6] - - [4096, 1024, 1, 3418] - - [453, 9870.76] + - [548, 9870.76] - - [1024, 4096, 1, 3493] - - [461, 9938.45] + - [556, 9938.45] - - [1024, 4096, 1, 3500] - - [453, 9916.93] + - [548, 9916.93] - - [1024, 4096, 1, 3166] - - [453, 9898.12] + - [548, 9898.12] - - [4096, 1024, 1, 3126] - - [454, 9847.04] + - [549, 9847.04] - - [1024, 4096, 1, 3277] - - [461, 9898.66] + - [556, 9898.66] - - [1024, 4096, 1, 3315] - - [460, 9923.11] + - [555, 9923.11] - - [1024, 1024, 1, 3927] - - [451, 8987.71] + - [546, 8987.71] - - [1024, 4096, 1, 3414] - - [453, 9916.01] + - [548, 9916.01] - - [4096, 1024, 1, 3531] - - [460, 9871.92] + - [555, 9871.92] - - [4096, 1024, 1, 3484] - - [453, 9867.86] + - [548, 9867.86] - - [1024, 4096, 1, 3180] - - [460, 9904.09] + - [555, 9904.09] - - [4096, 1024, 1, 3360] - - [460, 9879.57] + - [555, 9879.57] - - [1024, 33708, 1, 3990] - - [453, 10335.0] + - [548, 10335.0] - - [4096, 1024, 1, 3466] - - [460, 9875.02] + - [555, 9875.02] - - [1024, 4096, 1, 3428] - - [453, 9916.02] + - [548, 9916.02] - - [1024, 4096, 1, 3137] - - [460, 9913.27] + - [555, 9913.27] - - [4096, 1024, 1, 4059] - - [460, 9901.86] + - [555, 9901.86] - - [1024, 4096, 1, 3353] - - [460, 9914.6] + - [555, 9914.6] - - [1024, 4096, 1, 3942] - - [460, 9944.5] + - [555, 9944.5] - - [4096, 1024, 1, 3506] - - [453, 9875.75] + - [548, 9875.75] - - [1024, 1024, 1, 3894] - - [451, 8946.55] + - [546, 8946.55] - - [4096, 1024, 1, 3508] - - [461, 9877.67] + - [556, 9877.67] - - [64, 132, 480, 135] - - [444, 6164.86] + - [539, 6164.86] - - [4096, 1024, 1, 3956] - - [453, 9907.83] + - [548, 9907.83] - - [64, 7, 8192, 7] - - [432, 813.078] + - [527, 813.078] - - [1024, 4096, 1, 3272] - - [454, 9909.82] + - [549, 9909.82] - - [1024, 4096, 1, 3443] - - [461, 9929.83] + - [556, 9929.83] - - [1024, 4096, 1, 3375] - - [461, 9909.23] + - [556, 9909.23] - - [1024, 4096, 1, 3525] - - [461, 9929.27] + - [556, 9929.27] - - [4096, 1024, 1, 3472] - - [460, 9889.97] + - [555, 9889.97] - - [1024, 4096, 1, 3520] - - [453, 9947.79] + - [548, 9947.79] - - [4096, 1024, 1, 3322] - - [460, 9862.98] + - [555, 9862.98] - - [4096, 1024, 1, 3387] - - [460, 9861.62] + - [555, 9861.62] - - [64, 8, 7280, 8] - - [438, 1024.1] + - [533, 1024.1] - - [1024, 33708, 1, 3939] - - [453, 10339.9] + - [548, 10339.9] - - [4096, 1024, 1, 3345] - - [461, 9873.68] + - [556, 9873.68] - - [4096, 1024, 1, 2967] - - [460, 9839.21] + - [555, 9839.21] - - [1024, 4096, 1, 3453] - - [453, 9905.81] + - [548, 9905.81] - - [1024, 4096, 1, 3640] - - [460, 9934.05] + - [555, 9934.05] - - [4096, 1024, 1, 3291] - - [454, 9860.84] + - [549, 9860.84] - - [1024, 4096, 1, 3350] - - [461, 9918.03] + - [556, 9918.03] - - [4096, 1024, 1, 3417] - - [460, 9864.61] + - [555, 9864.61] - - [64, 135, 480, 135] - - [444, 6265.45] + - [539, 6265.45] - - [1024, 4096, 1, 3467] - - [454, 9906.95] + - [549, 9906.95] - - [1024, 4096, 1, 3491] - - [460, 9933.3] + - [555, 9933.3] - - [1024, 4096, 1, 3822] - - [460, 9938.75] + - [555, 9938.75] - - [4096, 1024, 1, 3292] - - [460, 9849.21] + - [555, 9849.21] - - [1024, 4096, 1, 3231] - - [453, 9905.82] + - [548, 9905.82] - - [1024, 4096, 1, 3364] - - [454, 9930.32] + - [549, 9930.32] - - [1024, 4096, 1, 3995] - - [454, 9943.76] + - [549, 9943.76] - - [1024, 4096, 1, 3545] - - [453, 9928.53] + - [548, 9928.53] - - [1024, 1024, 1, 3876] - - [451, 9003.04] + - [546, 9003.04] - - [1024, 4096, 1, 3186] - - [453, 9921.01] + - [548, 9921.01] - - [4096, 1024, 1, 3432] - - [460, 9875.29] + - [555, 9875.29] - - [64, 84, 752, 85] - - [431, 5704.51] + - [526, 5704.51] - - [4096, 1024, 1, 3367] - - [454, 9868.06] + - [549, 9868.06] - - [4096, 1024, 1, 3503] - - [461, 9871.01] + - [556, 9871.01] - - [1024, 4096, 1, 3095] - - [454, 9902.9] + - [549, 9902.9] - - [4096, 1024, 1, 3465] - - [461, 9872.17] + - [556, 9872.17] - - [1024, 4096, 1, 3402] - - [460, 9914.66] + - [555, 9914.66] - - [4096, 1024, 1, 3140] - - [460, 9847.95] + - [555, 9847.95] - - [1024, 1024, 1, 4050] - - [457, 9055.75] + - [552, 9055.75] - - [4096, 1024, 1, 3424] - - [454, 9894.62] + - [549, 9894.62] - - [4096, 1024, 1, 3257] - - [453, 9860.97] + - [548, 9860.97] - - [4096, 1024, 1, 2917] - - [460, 9845.91] + - [555, 9845.91] - - [1024, 33708, 1, 3640] - - [453, 10321.7] + - [548, 10321.7] - - [1024, 4096, 1, 3456] - - [453, 9950.35] + - [548, 9950.35] - - [1024, 4096, 1, 3014] - - [453, 9907.97] + - [548, 9907.97] - - [4096, 1024, 1, 3372] - - [461, 9868.37] + - [556, 9868.37] - - [64, 132, 480, 132] - - [444, 6121.62] + - [539, 6121.62] - - [1024, 4096, 1, 3294] - - [461, 9903.23] + - [556, 9903.23] - - [4096, 1024, 1, 3446] - - [461, 9871.69] + - [556, 9871.69] - - [1024, 4096, 1, 3389] - - [454, 9909.27] + - [549, 9909.27] - - [4096, 1024, 1, 3259] - - [460, 9860.76] + - [555, 9860.76] - - [4096, 1024, 1, 3544] - - [460, 9878.76] + - [555, 9878.76] - - [4096, 1024, 1, 3479] - - [461, 9873.97] + - [556, 9873.97] - - [4096, 1024, 1, 3542] - - [460, 9878.97] + - [555, 9878.97] - - [4096, 1024, 1, 3321] - - [453, 9861.13] + - [548, 9861.13] - - [1024, 4096, 1, 3147] - - [453, 9894.77] + - [548, 9894.77] - - [1024, 4096, 1, 3944] - - [453, 9950.51] + - [548, 9950.51] - - [4096, 1024, 1, 3870] - - [461, 9881.74] + - [556, 9881.74] - - [1024, 4096, 1, 3308] - - [453, 9907.26] + - [548, 9907.26] - - [4096, 1024, 1, 3401] - - [460, 9864.59] + - [555, 9864.59] - - [1024, 4096, 1, 3395] - - [453, 9929.03] + - [548, 9929.03] - - [64, 99, 624, 102] - - [429, 5651.36] + - [524, 5651.36] - - [1024, 4096, 1, 3563] - - [460, 9922.76] + - [555, 9922.76] - - [1024, 33708, 1, 3870] - - [453, 10325.4] + - [548, 10325.4] - - [4096, 1024, 1, 3494] - - [460, 9875.37] + - [555, 9875.37] - - [1024, 4096, 1, 3271] - - [453, 9913.09] + - [548, 9913.09] - - [1024, 33708, 1, 3910] - - [453, 10341.5] + - [548, 10341.5] - - [1024, 4096, 1, 3287] - - [461, 9924.87] + - [556, 9924.87] - - [1024, 33708, 1, 3860] - - [453, 10330.7] + - [548, 10330.7] - - [64, 143, 432, 148] - - [446, 6571.78] + - [541, 6571.78] - - [1024, 1024, 1, 3584] - - [458, 8975.31] + - [553, 8975.31] - - [64, 162, 400, 162] - - [448, 6822.26] + - [543, 6822.26] - - [4096, 1024, 1, 3341] - - [460, 9854.66] + - [555, 9854.66] - - [1024, 4096, 1, 3136] - - [453, 9926.86] + - [548, 9926.86] - - [4096, 1024, 1, 3439] - - [460, 9854.33] + - [555, 9854.33] - - [64, 148, 432, 147] - - [444, 6677.61] + - [539, 6677.61] - - [1024, 4096, 1, 3751] - - [460, 9938.48] + - [555, 9938.48] - - [1024, 4096, 1, 3301] - - [460, 9919.15] + - [555, 9919.15] - - [4096, 1024, 1, 3468] - - [461, 9859.83] + - [556, 9859.83] - - [1024, 4096, 1, 3416] - - [461, 9918.52] + - [556, 9918.52] - - [4096, 1024, 1, 3163] - - [460, 9854.65] + - [555, 9854.65] - - [1024, 4096, 1, 3230] - - [454, 9897.54] + - [549, 9897.54] - - [1024, 4096, 1, 3581] - - [454, 9915.48] + - [549, 9915.48] - - [1024, 1024, 1, 3960] - - [456, 9045.86] + - [551, 9045.86] - - [4096, 1024, 1, 3463] - - [461, 9884.74] + - [556, 9884.74] - - [1024, 4096, 1, 3478] - - [454, 9927.02] + - [549, 9927.02] - - [4096, 1024, 1, 3262] - - [460, 9852.22] + - [555, 9852.22] - - [1024, 4096, 1, 3438] - - [460, 9912.68] + - [555, 9912.68] - - [1024, 4096, 1, 3244] - - [453, 9900.51] + - [548, 9900.51] - - [1024, 4096, 1, 3445] - - [453, 9920.32] + - [548, 9920.32] - - [4096, 1024, 1, 3328] - - [460, 9888.07] + - [555, 9888.07] - - [1024, 4096, 1, 3492] - - [454, 9937.22] + - [549, 9937.22] - - [4096, 1024, 1, 3211] - - [454, 9847.95] + - [549, 9847.95] - - [1024, 4096, 1, 3910] - - [461, 9946.57] + - [556, 9946.57] - - [1024, 4096, 1, 3314] - - [453, 9932.6] + - [548, 9932.6] - - [4096, 1024, 1, 3859] - - [460, 9902.84] + - [555, 9902.84] - - [4096, 1024, 1, 3383] - - [460, 9875.2] + - [555, 9875.2] - - [1024, 4096, 1, 3409] - - [461, 9926.79] + - [556, 9926.79] - - [1024, 4096, 1, 4020] - - [453, 9941.8] + - [548, 9941.8] - - [4096, 1024, 1, 3530] - - [460, 9872.81] + - [555, 9872.81] - - [4096, 1024, 1, 3411] - - [461, 9875.02] + - [556, 9875.02] - - [1024, 4096, 1, 3566] - - [461, 9921.1] + - [556, 9921.1] - - [4096, 1024, 1, 3493] - - [453, 9875.74] + - [548, 9875.74] - - [4096, 1024, 1, 3184] - - [460, 9873.14] + - [555, 9873.14] - - [1024, 4096, 1, 3072] - - [453, 9923.79] + - [548, 9923.79] - - [1024, 4096, 1, 3431] - - [454, 9911.03] + - [549, 9911.03] - - [4096, 1024, 1, 3306] - - [461, 9853.42] + - [556, 9853.42] - - [1024, 4096, 1, 3352] - - [461, 9913.32] + - [556, 9913.32] - - [4096, 1024, 1, 3295] - - [460, 9862.68] + - [555, 9862.68] - - [64, 123, 528, 122] - - [426, 6950.25] + - [521, 6950.25] - - [1024, 4096, 1, 3517] - - [454, 9920.06] + - [549, 9920.06] - - [64, 102, 624, 101] - - [434, 5791.49] + - [529, 5791.49] - - [4096, 1024, 1, 3426] - - [460, 9891.14] + - [555, 9891.14] - - [4096, 1024, 1, 3385] - - [460, 9868.41] + - [555, 9868.41] - - [1024, 1024, 1, 3978] - - [451, 9008.48] + - [546, 9008.48] - - [4096, 1024, 1, 3572] - - [453, 9884.81] + - [548, 9884.81] - - [4096, 1024, 1, 3459] - - [460, 9892.17] + - [555, 9892.17] - - [1024, 4096, 1, 3374] - - [461, 9908.52] + - [556, 9908.52] - - [4096, 1024, 1, 3166] - - [460, 9832.45] + - [555, 9832.45] - - [4096, 1024, 1, 3093] - - [461, 9841.25] + - [556, 9841.25] - - [4096, 1024, 1, 3523] - - [454, 9879.05] + - [549, 9879.05] - - [4096, 1024, 1, 3413] - - [454, 9880.81] + - [549, 9880.81] - - [1024, 4096, 1, 3996] - - [453, 9948.14] + - [548, 9948.14] - - [1024, 4096, 1, 3452] - - [461, 9915.97] + - [556, 9915.97] - - [4096, 1024, 1, 3232] - - [461, 9876.54] + - [556, 9876.54] - - [4096, 1024, 1, 3400] - - [453, 9867.15] + - [548, 9867.15] - - [4096, 1024, 1, 3334] - - [460, 9868.99] + - [555, 9868.99] - - [1024, 4096, 1, 3345] - - [453, 9920.6] + - [548, 9920.6] - - [1024, 4096, 1, 3538] - - [460, 9933.34] + - [555, 9933.34] - - [1024, 4096, 1, 3466] - - [460, 9920.85] + - [555, 9920.85] - - [4096, 1024, 1, 3315] - - [460, 9876.87] + - [555, 9876.87] - - [4096, 1024, 1, 3214] - - [461, 9847.93] + - [556, 9847.93] - - [1024, 33708, 1, 3900] - - [453, 10331.7] + - [548, 10331.7] - - [64, 160, 400, 160] - - [446, 7440.61] + - [541, 7440.61] - - [1024, 4096, 1, 3367] - - [460, 9926.32] + - [555, 9926.32] - - [1024, 4096, 1, 2917] - - [461, 9904.57] + - [556, 9904.57] - - [1024, 1024, 1, 3995] - - [452, 9000.33] + - [547, 9000.33] - - [64, 132, 480, 134] - - [444, 6146.88] + - [539, 6146.88] - - [1024, 4096, 1, 3544] - - [461, 9924.14] + - [556, 9924.14] - - [4096, 1024, 1, 3414] - - [461, 9867.9] + - [556, 9867.9] - - [4096, 1024, 1, 3565] - - [454, 9870.13] + - [549, 9870.13] - - [1024, 4096, 1, 3512] - - [460, 9919.84] + - [555, 9919.84] - - [1024, 4096, 1, 3191] - - [461, 9914.79] + - [556, 9914.79] - - [64, 27, 2336, 27] - - [428, 3054.71] + - [523, 3054.71] - - [1024, 4096, 1, 3289] - - [461, 9917.2] + - [556, 9917.2] - - [4096, 1024, 1, 3290] - - [460, 9858.41] + - [555, 9858.41] - - [1024, 4096, 1, 3211] - - [461, 9897.16] + - [556, 9897.16] - - [1024, 33708, 1, 3969] - - [454, 10336.1] + - [549, 10336.1] - - [4096, 1024, 1, 3566] - - [460, 9863.0] + - [555, 9863.0] - - [64, 111, 576, 111] - - [434, 6400.91] + - [529, 6400.91] - - [1024, 4096, 1, 3459] - - [460, 9923.03] + - [555, 9923.03] - - [1024, 4096, 1, 3372] - - [453, 9909.86] + - [548, 9909.86] - - [4096, 1024, 1, 3339] - - [460, 9859.3] + - [555, 9859.3] - - [4096, 1024, 1, 3425] - - [460, 9889.34] + - [555, 9889.34] - - [4096, 1024, 1, 3388] - - [460, 9871.67] + - [555, 9871.67] - - [1024, 4096, 1, 3531] - - [453, 9919.0] + - [548, 9919.0] - - [4096, 1024, 1, 3286] - - [461, 9868.42] + - [556, 9868.42] - - [4096, 1024, 1, 3462] - - [460, 9881.88] + - [555, 9881.88] - - [1024, 4096, 1, 3388] - - [453, 9904.69] + - [548, 9904.69] - - [4096, 1024, 1, 3165] - - [453, 9836.33] + - [548, 9836.33] - - [4096, 1024, 1, 3304] - - [460, 9857.55] + - [555, 9857.55] - - [1024, 4096, 1, 2736] - - [460, 9901.07] + - [555, 9901.07] - - [4096, 1024, 1, 3397] - - [460, 9872.1] + - [555, 9872.1] - - [64, 38, 1680, 38] - - [427, 3459.52] + - [522, 3459.52] - - [1024, 4096, 1, 3311] - - [461, 9908.32] + - [556, 9908.32] - - [1024, 4096, 1, 3394] - - [461, 9929.43] + - [556, 9929.43] - - [4096, 1024, 1, 2736] - - [460, 9833.88] + - [555, 9833.88] - - [1024, 4096, 1, 3559] - - [454, 9925.33] + - [549, 9925.33] - - [4096, 1024, 1, 3180] - - [460, 9838.05] + - [555, 9838.05] - - [1024, 4096, 1, 3480] - - [453, 9922.46] + - [548, 9922.46] - - [4096, 1024, 1, 3318] - - [460, 9867.87] + - [555, 9867.87] - - [4096, 1024, 1, 3213] - - [460, 9846.02] + - [555, 9846.02] - - [1024, 4096, 1, 3286] - - [460, 9912.14] + - [555, 9912.14] - - [4096, 1024, 1, 3471] - - [460, 9874.24] + - [555, 9874.24] - - [1024, 4096, 1, 3381] - - [461, 9922.96] + - [556, 9922.96] - - [64, 100, 624, 100] - - [435, 5705.24] + - [530, 5705.24] - - [4096, 1024, 1, 3502] - - [460, 9872.44] + - [555, 9872.44] - - [64, 16, 3840, 16] - - [441, 2091.67] + - [536, 2091.67] - - [1024, 4096, 1, 3552] - - [453, 9943.89] + - [548, 9943.89] - - [4096, 1024, 1, 3519] - - [461, 9869.95] + - [556, 9869.95] - - [1024, 4096, 1, 3300] - - [454, 9916.15] + - [549, 9916.15] - - [1024, 4096, 1, 3419] - - [453, 9914.06] + - [548, 9914.06] - - [4096, 1024, 1, 4030] - - [454, 9893.73] + - [549, 9893.73] - - [4096, 1024, 1, 3976] - - [461, 9898.35] + - [556, 9898.35] - - [1024, 4096, 1, 3473] - - [461, 9928.42] + - [556, 9928.42] - - [1024, 1024, 1, 3977] - - [458, 9009.33] + - [553, 9009.33] - - [4096, 1024, 1, 3428] - - [460, 9876.79] + - [555, 9876.79] - - [1024, 4096, 1, 3433] - - [454, 9923.92] + - [549, 9923.92] - - [4096, 1024, 1, 3534] - - [454, 9864.0] + - [549, 9864.0] - - [4096, 1024, 1, 3461] - - [460, 9873.12] + - [555, 9873.12] - - [4096, 1024, 1, 3681] - - [460, 9898.57] + - [555, 9898.57] - - [4096, 1024, 1, 3495] - - [461, 9876.08] + - [556, 9876.08] - - [4096, 1024, 1, 3351] - - [460, 9879.71] + - [555, 9879.71] - - [1024, 4096, 1, 4059] - - [453, 9948.61] + - [548, 9948.61] - - [4096, 1024, 1, 3990] - - [460, 9900.76] + - [555, 9900.76] - - [1024, 4096, 1, 3325] - - [454, 9903.3] + - [549, 9903.3] - - [1024, 4096, 1, 3408] - - [460, 9932.15] + - [555, 9932.15] - - [64, 59, 1088, 59] - - [434, 5343.77] + - [529, 5343.77] - - [4096, 1024, 1, 3394] - - [461, 9878.17] + - [556, 9878.17] - - [1024, 4096, 1, 3573] - - [461, 9935.3] + - [556, 9935.3] - - [4096, 1024, 1, 3386] - - [460, 9866.38] + - [555, 9866.38] - - [4096, 1024, 1, 3540] - - [460, 9882.33] + - [555, 9882.33] - - [1024, 4096, 1, 3182] - - [454, 9894.45] + - [549, 9894.45] - - [1024, 4096, 1, 3430] - - [453, 9915.24] + - [548, 9915.24] - - [1024, 4096, 1, 3236] - - [461, 9920.56] + - [556, 9920.56] - - [4096, 1024, 1, 2977] - - [460, 9848.08] + - [555, 9848.08] - - [1024, 4096, 1, 3355] - - [460, 9908.78] + - [555, 9908.78] - - [4096, 1024, 1, 3139] - - [460, 9850.71] + - [555, 9850.71] - - [4096, 1024, 1, 3516] - - [454, 9874.21] + - [549, 9874.21] - - [4096, 1024, 1, 3368] - - [454, 9872.64] + - [549, 9872.64] - - [4096, 1024, 1, 3559] - - [453, 9884.32] + - [548, 9884.32] - - [64, 11, 5456, 11] - - [441, 1382.67] + - [536, 1382.67] - - [1024, 4096, 1, 3506] - - [460, 9937.69] + - [555, 9937.69] - - [1024, 4096, 1, 3145] - - [453, 9905.11] + - [548, 9905.11] - - [1024, 4096, 1, 3369] - - [460, 9912.71] + - [555, 9912.71] - - [64, 112, 576, 112] - - [426, 6583.56] + - [521, 6583.56] - - [4096, 1024, 1, 3522] - - [460, 9889.47] + - [555, 9889.47] - - [1024, 33708, 1, 3894] - - [453, 10337.5] + - [548, 10337.5] - - [64, 159, 400, 162] - - [444, 7057.09] + - [539, 7057.09] - - [4096, 1024, 1, 3336] - - [460, 9867.67] + - [555, 9867.67] - - [1024, 4096, 1, 3382] - - [454, 9915.9] + - [549, 9915.9] - - [4096, 1024, 1, 3533] - - [460, 9878.56] + - [555, 9878.56] - - [4096, 1024, 1, 4050] - - [461, 9916.82] + - [556, 9916.82] - - [4096, 1024, 1, 3480] - - [454, 9869.32] + - [549, 9869.32] - - [1024, 4096, 1, 3344] - - [453, 9935.61] + - [548, 9935.61] - - [64, 122, 528, 122] - - [426, 6871.14] + - [521, 6871.14] - - [1024, 4096, 1, 3509] - - [454, 9925.8] + - [549, 9925.8] - - [1024, 4096, 1, 3956] - - [453, 9958.26] + - [548, 9958.26] - - [4096, 1024, 1, 3616] - - [460, 9904.63] + - [555, 9904.63] - - [1024, 4096, 1, 3366] - - [453, 9919.47] + - [548, 9919.47] - - [4096, 1024, 1, 2935] - - [453, 9833.23] + - [548, 9833.23] - - [4096, 1024, 1, 3393] - - [460, 9877.45] + - [555, 9877.45] - - [4096, 1024, 1, 3547] - - [454, 9865.1] + - [549, 9865.1] - - [1024, 4096, 1, 3499] - - [461, 9912.49] + - [556, 9912.49] - - [4096, 1024, 1, 3357] - - [460, 9855.28] + - [555, 9855.28] - - [4096, 1024, 1, 3272] - - [460, 9861.97] + - [555, 9861.97] - - [4096, 1024, 1, 3207] - - [460, 9847.78] + - [555, 9847.78] - - [4096, 1024, 1, 3894] - - [460, 9918.86] + - [555, 9918.86] - - [1024, 4096, 1, 3444] - - [460, 9932.71] + - [555, 9932.71] - - [4096, 1024, 1, 3561] - - [460, 9872.61] + - [555, 9872.61] - - [4096, 1024, 1, 3376] - - [460, 9885.59] + - [555, 9885.59] - - [1024, 4096, 1, 3458] - - [460, 9929.39] + - [555, 9929.39] - - [4096, 1024, 1, 3231] - - [454, 9847.08] + - [549, 9847.08] - - [64, 228, 272, 228] - - [455, 7302.69] + - [550, 7302.69] - - [1024, 4096, 1, 3505] - - [461, 9931.63] + - [556, 9931.63] - - [4096, 1024, 1, 3277] - - [460, 9857.2] + - [555, 9857.2] - - [64, 21, 2976, 21] - - [430, 2436.14] + - [525, 2436.14] - - [1024, 4096, 1, 3391] - - [460, 9911.25] + - [555, 9911.25] - - [64, 32, 1984, 32] - - [442, 3572.17] + - [537, 3572.17] - - [1024, 4096, 1, 3536] - - [461, 9946.9] + - [556, 9946.9] - - [1024, 4096, 1, 3063] - - [460, 9906.92] + - [555, 9906.92] - - [1024, 1024, 1, 3925] - - [452, 9011.45] + - [547, 9011.45] - - [1024, 4096, 1, 3189] - - [454, 9900.95] + - [549, 9900.95] - - [1024, 4096, 1, 2505] - - [460, 9854.85] + - [555, 9854.85] - - [4096, 1024, 1, 3454] - - [453, 9864.96] + - [548, 9864.96] - - [1024, 4096, 1, 3405] - - [461, 9906.33] + - [556, 9906.33] - - [1024, 33708, 1, 4050] - - [454, 10343.7] + - [549, 10343.7] - - [4096, 1024, 1, 3520] - - [460, 9887.03] + - [555, 9887.03] - - [64, 93, 688, 93] - - [437, 6222.86] + - [532, 6222.86] - - [1024, 4096, 1, 3487] - - [461, 9918.69] + - [556, 9918.69] - - [1024, 4096, 1, 3558] - - [461, 9930.99] + - [556, 9930.99] - - [4096, 1024, 1, 3297] - - [460, 9874.31] + - [555, 9874.31] - - [1024, 1024, 1, 3840] - - [456, 9075.42] + - [551, 9075.42] - - [1024, 4096, 1, 3483] - - [460, 9915.38] + - [555, 9915.38] - - [1024, 1024, 1, 3956] - - [459, 9010.03] + - [554, 9010.03] - - [1024, 33708, 1, 3751] - - [454, 10325.9] + - [549, 10325.9] - - [4096, 1024, 1, 3380] - - [460, 9888.47] + - [555, 9888.47] - - [1024, 4096, 1, 3380] - - [453, 9927.25] + - [548, 9927.25] - - [1024, 4096, 1, 3396] - - [461, 9931.96] + - [556, 9931.96] - - [1024, 4096, 1, 3497] - - [454, 9914.86] + - [549, 9914.86] - - [1024, 4096, 1, 3502] - - [461, 9921.52] + - [556, 9921.52] - - [1024, 1024, 1, 3976] - - [456, 9060.3] + - [551, 9060.3] - - [1024, 4096, 1, 3138] - - [454, 9908.66] + - [549, 9908.66] - - [4096, 1024, 1, 3939] - - [453, 9910.23] + - [548, 9910.23] - - [1024, 4096, 1, 3303] - - [454, 9916.64] + - [549, 9916.64] - - [64, 111, 576, 112] - - [434, 6495.19] + - [529, 6495.19] - - [1024, 4096, 1, 3418] - - [460, 9913.35] + - [555, 9913.35] - - [1024, 4096, 1, 3224] - - [454, 9904.05] + - [549, 9904.05] - - [4096, 1024, 1, 3978] - - [460, 9896.28] + - [555, 9896.28] - - [1024, 4096, 1, 3472] - - [453, 9937.48] + - [548, 9937.48] - - [4096, 1024, 1, 3353] - - [461, 9863.97] + - [556, 9863.97] - - [4096, 1024, 1, 3362] - - [460, 9871.06] + - [555, 9871.06] - - [1024, 33708, 1, 3978] - - [453, 10325.4] + - [548, 10325.4] - - [64, 100, 624, 102] - - [429, 5695.67] + - [524, 5695.67] - - [1024, 4096, 1, 3432] - - [461, 9915.56] + - [556, 9915.56] - - [1024, 4096, 1, 3139] - - [460, 9914.21] + - [555, 9914.21] - - [1024, 4096, 1, 3341] - - [461, 9912.1] + - [556, 9912.1] - - [1024, 4096, 1, 3494] - - [454, 9924.6] + - [549, 9924.6] - - [1024, 4096, 1, 3969] - - [453, 9952.28] + - [548, 9952.28] - - [1024, 4096, 1, 3163] - - [461, 9911.79] + - [556, 9911.79] - - [1024, 1024, 1, 3955] - - [451, 9097.86] + - [546, 9097.86] - - [4096, 1024, 1, 3405] - - [460, 9853.84] + - [555, 9853.84] - - [1024, 1024, 1, 4030] - - [451, 9083.86] + - [546, 9083.86] - - [4096, 1024, 1, 3453] - - [460, 9858.88] + - [555, 9858.88] - - [1024, 4096, 1, 3411] - - [461, 9926.54] + - [556, 9926.54] - - [1024, 4096, 1, 3527] - - [454, 9922.65] + - [549, 9922.65] - - [4096, 1024, 1, 3474] - - [460, 9878.49] + - [555, 9878.49] - - [1024, 4096, 1, 3572] - - [460, 9932.0] + - [555, 9932.0] - - [4096, 1024, 1, 3293] - - [460, 9848.26] + - [555, 9848.26] - - [4096, 1024, 1, 3247] - - [460, 9861.45] + - [555, 9861.45] - - [64, 15, 4096, 15] - - [441, 1955.75] + - [536, 1955.75] - - [1024, 4096, 1, 3425] - - [461, 9936.4] + - [556, 9936.4] - - [1024, 4096, 1, 3354] - - [453, 9917.55] + - [548, 9917.55] - - [4096, 1024, 1, 3382] - - [460, 9885.49] + - [555, 9885.49] - - [4096, 1024, 1, 3236] - - [460, 9860.6] + - [555, 9860.6] - - [1024, 4096, 1, 3519] - - [461, 9919.3] + - [556, 9919.3] - - [4096, 1024, 1, 3354] - - [460, 9854.75] + - [555, 9854.75] - - [4096, 1024, 1, 3501] - - [461, 9869.62] + - [556, 9869.62] - - [1024, 1024, 1, 3906] - - [459, 9104.99] + - [554, 9104.99] - - [4096, 1024, 1, 3266] - - [460, 9873.97] + - [555, 9873.97] - - [64, 101, 624, 102] - - [429, 5765.52] + - [524, 5765.52] - - [1024, 4096, 1, 3368] - - [460, 9909.77] + - [555, 9909.77] - - [1024, 4096, 1, 4030] - - [461, 9940.27] + - [556, 9940.27] - - [1024, 4096, 1, 3533] - - [454, 9916.64] + - [549, 9916.64] - - [4096, 1024, 1, 3332] - - [461, 9876.45] + - [556, 9876.45] - - [4096, 1024, 1, 3584] - - [460, 9896.6] + - [555, 9896.6] - - [1024, 4096, 1, 3616] - - [460, 9957.18] + - [555, 9957.18] - - [4096, 1024, 1, 3265] - - [460, 9877.78] + - [555, 9877.78] - - [4096, 1024, 1, 3361] - - [460, 9888.61] + - [555, 9888.61] - - [4096, 1024, 1, 3467] - - [460, 9863.4] + - [555, 9863.4] - - [1024, 4096, 1, 3454] - - [454, 9904.89] + - [549, 9904.89] - - [1024, 4096, 1, 3101] - - [461, 9893.12] + - [556, 9893.12] - - [1024, 4096, 1, 3508] - - [461, 9931.54] + - [556, 9931.54] - - [4096, 1024, 1, 3267] - - [460, 9864.48] + - [555, 9864.48] - - [64, 54, 1184, 54] - - [426, 4906.02] + - [521, 4906.02] - - [4096, 1024, 1, 3419] - - [460, 9872.56] + - [555, 9872.56] - - [4096, 1024, 1, 3822] - - [460, 9892.63] + - [555, 9892.63] - - [1024, 4096, 1, 3266] - - [460, 9918.58] + - [555, 9918.58] - - [4096, 1024, 1, 3440] - - [461, 9890.16] + - [556, 9890.16] - - [1024, 4096, 1, 3361] - - [460, 9930.97] + - [555, 9930.97] - - [1024, 4096, 1, 3546] - - [454, 9926.56] + - [549, 9926.56] - - [4096, 1024, 1, 3473] - - [460, 9889.06] + - [555, 9889.06] - - [4096, 1024, 1, 3546] - - [461, 9872.27] + - [556, 9872.27] - - [1024, 4096, 1, 3088] - - [454, 9918.03] + - [549, 9918.03] - - [1024, 4096, 1, 3535] - - [461, 9921.2] + - [556, 9921.2] - - [1024, 4096, 1, 3447] - - [461, 9920.63] + - [556, 9920.63] - - [1024, 4096, 1, 3560] - - [460, 9925.48] + - [555, 9925.48] - - [1024, 4096, 1, 3422] - - [454, 9922.21] + - [549, 9922.21] - - [1024, 4096, 1, 3469] - - [453, 9906.18] + - [548, 9906.18] - - [4096, 1024, 1, 3488] - - [460, 9903.26] + - [555, 9903.26] - - [1024, 4096, 1, 3110] - - [460, 9906.76] + - [555, 9906.76] - - [1024, 4096, 1, 3265] - - [461, 9916.69] + - [556, 9916.69] - - [1024, 4096, 1, 3291] - - [460, 9902.73] + - [555, 9902.73] - - [1024, 4096, 1, 3390] - - [461, 9907.22] + - [556, 9907.22] - - [4096, 1024, 1, 3046] - - [460, 9847.68] + - [555, 9847.68] - - [1024, 4096, 1, 3539] - - [461, 9933.49] + - [556, 9933.49] - - [4096, 1024, 1, 3221] - - [461, 9860.74] + - [556, 9860.74] - - [4096, 1024, 1, 3433] - - [460, 9872.74] + - [555, 9872.74] - - [4096, 1024, 1, 3364] - - [461, 9881.91] + - [556, 9881.91] - - [4096, 1024, 1, 3470] - - [460, 9858.56] + - [555, 9858.56] - - [1024, 4096, 1, 3404] - - [453, 9907.27] + - [548, 9907.27] - - [1024, 33708, 1, 3968] - - [454, 10350.3] + - [549, 10350.3] - - [4096, 1024, 1, 3088] - - [460, 9869.06] + - [555, 9869.06] - - [1024, 4096, 1, 3247] - - [460, 9901.02] + - [555, 9901.02] - - [1024, 33708, 1, 3996] - - [453, 10328.5] + - [548, 10328.5] - - [4096, 1024, 1, 3482] - - [461, 9866.99] + - [556, 9866.99] - - [1024, 1024, 1, 3796] - - [456, 9031.68] + - [551, 9031.68] - - [4096, 1024, 1, 3995] - - [461, 9896.78] + - [556, 9896.78] - - [1024, 1024, 1, 3859] - - [458, 9097.36] + - [553, 9097.36] - - [1024, 4096, 1, 3280] - - [454, 9934.05] + - [549, 9934.05] - - [4096, 1024, 1, 3271] - - [461, 9860.09] + - [556, 9860.09] - - [64, 10, 5952, 10] - - [441, 1221.02] + - [536, 1221.02] - - [4096, 1024, 1, 3545] - - [460, 9877.35] + - [555, 9877.35] - - [4096, 1024, 1, 3476] - - [453, 9882.57] + - [548, 9882.57] - - [4096, 1024, 1, 3496] - - [454, 9880.5] + - [549, 9880.5] - - [4096, 1024, 1, 3191] - - [454, 9858.7] + - [549, 9858.7] - - [4096, 1024, 1, 3311] - - [461, 9853.2] + - [556, 9853.2] - - [1024, 4096, 1, 3302] - - [461, 9919.32] + - [556, 9919.32] - - [1024, 4096, 1, 3681] - - [460, 9944.99] + - [555, 9944.99] - - [4096, 1024, 1, 3582] - - [453, 9869.77] + - [548, 9869.77] - - [4096, 1024, 1, 3421] - - [461, 9856.08] + - [556, 9856.08] - - [4096, 1024, 1, 3560] - - [454, 9884.48] + - [549, 9884.48] - - [1024, 4096, 1, 3495] - - [461, 9930.13] + - [556, 9930.13] - - [4096, 1024, 1, 3186] - - [460, 9870.59] + - [555, 9870.59] - - [4096, 1024, 1, 3925] - - [460, 9904.0] + - [555, 9904.0] - - [64, 71, 896, 71] - - [445, 5004.79] + - [540, 5004.79] - - [1024, 4096, 1, 3435] - - [461, 9916.58] + - [556, 9916.58] - - [4096, 1024, 1, 3434] - - [460, 9871.29] + - [555, 9871.29] - - [1024, 33708, 1, 4012] - - [453, 10332.5] + - [548, 10332.5] - - [1024, 4096, 1, 3340] - - [453, 9918.11] + - [548, 9918.11] - - [1024, 1024, 1, 3860] - - [451, 8999.36] + - [546, 8999.36] - - [4096, 1024, 1, 3489] - - [460, 9882.02] + - [555, 9882.02] - - [1024, 4096, 1, 3162] - - [461, 9906.28] + - [556, 9906.28] - - [4096, 1024, 1, 3436] - - [460, 9858.12] + - [555, 9858.12] - - [1024, 1024, 1, 4005] - - [457, 9043.06] + - [552, 9043.06] - - [64, 84, 752, 84] - - [430, 5629.93] + - [525, 5629.93] - - [4096, 1024, 1, 3574] - - [460, 9886.7] + - [555, 9886.7] - - [4096, 1024, 1, 3469] - - [453, 9856.26] + - [548, 9856.26] - - [1024, 4096, 1, 3410] - - [454, 9924.74] + - [549, 9924.74] - - [1024, 4096, 1, 3216] - - [453, 9930.67] + - [548, 9930.67] - - [4096, 1024, 1, 3095] - - [460, 9847.01] + - [555, 9847.01] - - [1024, 1024, 1, 3990] - - [459, 9089.04] + - [554, 9089.04] - - [4096, 1024, 1, 3448] - - [460, 9863.94] + - [555, 9863.94] - - [1024, 4096, 1, 3176] - - [461, 9914.01] + - [556, 9914.01] - - [64, 49, 1296, 49] - - [426, 4437.46] + - [521, 4437.46] - - [4096, 1024, 1, 2918] - - [460, 9830.93] + - [555, 9830.93] - - [64, 14, 4368, 14] - - [440, 1802.47] + - [535, 1802.47] - - [1024, 4096, 1, 3424] - - [460, 9934.05] + - [555, 9934.05] - - [4096, 1024, 1, 3402] - - [453, 9863.12] + - [548, 9863.12] - - [4096, 1024, 1, 3145] - - [454, 9856.56] + - [549, 9856.56] - - [64, 134, 480, 134] - - [446, 6184.05] + - [541, 6184.05] - - [1024, 33708, 1, 3976] - - [454, 10330.1] + - [549, 10330.1] - - [4096, 1024, 1, 3518] - - [453, 9856.07] + - [548, 9856.07] - - [4096, 1024, 1, 3110] - - [460, 9856.46] + - [555, 9856.46] - - [4096, 1024, 1, 3325] - - [460, 9852.36] + - [555, 9852.36] - - [1024, 33708, 1, 3999] - - [453, 10329.7] + - [548, 10329.7] - - [4096, 1024, 1, 2985] - - [460, 9837.3] + - [555, 9837.3] - - [1024, 4096, 1, 3371] - - [453, 9913.03] + - [548, 9913.03] - - [4096, 1024, 1, 3342] - - [460, 9863.16] + - [555, 9863.16] - - [4096, 1024, 1, 3141] - - [454, 9849.91] + - [549, 9849.91] - - [4096, 1024, 1, 3532] - - [454, 9866.3] + - [549, 9866.3] - - [64, 78, 816, 78] - - [431, 5316.88] + - [526, 5316.88] - - [1024, 4096, 1, 3169] - - [461, 9910.45] + - [556, 9910.45] - - [1024, 4096, 1, 3514] - - [460, 9918.0] + - [555, 9918.0] - - [4096, 1024, 1, 3780] - - [461, 9899.75] + - [556, 9899.75] - - [1024, 4096, 1, 3098] - - [453, 9901.62] + - [548, 9901.62] - - [1024, 4096, 1, 3449] - - [461, 9919.85] + - [556, 9919.85] - - [1024, 4096, 1, 3222] - - [453, 9917.66] + - [548, 9917.66] - - [1024, 4096, 1, 3346] - - [454, 9912.91] + - [549, 9912.91] - - [4096, 1024, 1, 3064] - - [461, 9848.79] + - [556, 9848.79] - - [4096, 1024, 1, 3511] - - [460, 9873.39] + - [555, 9873.39] - - [4096, 1024, 1, 3384] - - [460, 9870.98] + - [555, 9870.98] - - [4096, 1024, 1, 3356] - - [454, 9853.45] + - [549, 9853.45] - - [1024, 4096, 1, 3796] - - [453, 9940.66] + - [548, 9940.66] - - [4096, 1024, 1, 3427] - - [460, 9883.14] + - [555, 9883.14] - - [4096, 1024, 1, 3390] - - [460, 9863.79] + - [555, 9863.79] - - [4096, 1024, 1, 3573] - - [461, 9886.02] + - [556, 9886.02] - - [4096, 1024, 1, 3456] - - [454, 9890.61] + - [549, 9890.61] - - [1024, 4096, 1, 3360] - - [461, 9938.1] + - [556, 9938.1] - - [1024, 33708, 1, 3977] - - [454, 10327.2] + - [549, 10327.2] - - [1024, 4096, 1, 2918] - - [453, 9902.84] + - [548, 9902.84] - - [4096, 1024, 1, 3975] - - [460, 9905.27] + - [555, 9905.27] - - [4096, 1024, 1, 3525] - - [461, 9879.91] + - [556, 9879.91] - - [4096, 1024, 1, 3398] - - [453, 9873.91] + - [548, 9873.91] - - [4096, 1024, 1, 3640] - - [460, 9885.16] + - [555, 9885.16] - - [1024, 1024, 1, 3999] - - [452, 8995.42] + - [547, 8995.42] - - [4096, 1024, 1, 3014] - - [460, 9841.32] + - [555, 9841.32] - - [1024, 4096, 1, 3446] - - [453, 9917.21] + - [548, 9917.21] - - [1024, 33708, 1, 3796] - - [453, 10339.0] + - [548, 10339.0] - - [4096, 1024, 1, 3101] - - [453, 9827.34] + - [548, 9827.34] - - [4096, 1024, 1, 3563] - - [461, 9863.03] + - [556, 9863.03] - - [4096, 1024, 1, 3539] - - [453, 9889.54] + - [548, 9889.54] - - [4096, 1024, 1, 3182] - - [460, 9833.79] + - [555, 9833.79] - - [1024, 4096, 1, 3468] - - [454, 9913.05] + - [549, 9913.05] - - [4096, 1024, 1, 3312] - - [460, 9889.85] + - [555, 9889.85] - - [4096, 1024, 1, 3215] - - [460, 9853.88] + - [555, 9853.88] - - [4096, 1024, 1, 3910] - - [460, 9894.72] + - [555, 9894.72] - - [1024, 33708, 1, 3780] - - [454, 10332.0] + - [549, 10332.0] - - [1024, 4096, 1, 3290] - - [460, 9915.08] + - [555, 9915.08] - - [1024, 4096, 1, 4012] - - [460, 9942.65] + - [555, 9942.65] - - [1024, 4096, 1, 3385] - - [460, 9915.83] + - [555, 9915.83] - - [1024, 33708, 1, 3975] - - [453, 10330.1] + - [548, 10330.1] - - [4096, 1024, 1, 3996] - - [460, 9891.31] + - [555, 9891.31] - - [4096, 1024, 1, 2765] - - [461, 9800.38] + - [556, 9800.38] - - [4096, 1024, 1, 3538] - - [461, 9886.22] + - [556, 9886.22] - - [4096, 1024, 1, 3415] - - [461, 9874.6] + - [556, 9874.6] - - [1024, 4096, 1, 3554] - - [460, 9931.99] + - [555, 9931.99] - - [4096, 1024, 1, 3513] - - [454, 9874.25] + - [549, 9874.25] - - [1024, 4096, 1, 3304] - - [454, 9907.73] + - [549, 9907.73] - - [4096, 1024, 1, 3294] - - [460, 9851.25] + - [555, 9851.25] - - [4096, 1024, 1, 3396] - - [461, 9880.7] + - [556, 9880.7] - - [1024, 4096, 1, 3213] - - [454, 9891.12] + - [549, 9891.12] - - [4096, 1024, 1, 3137] - - [454, 9857.41] + - [549, 9857.41] - - [4096, 1024, 1, 3552] - - [460, 9904.22] + - [555, 9904.22] - - [1024, 1024, 1, 4020] - - [459, 9098.87] + - [554, 9098.87] - - [64, 13, 4672, 13] - - [441, 1693.54] + - [536, 1693.54] - - [1024, 4096, 1, 3461] - - [460, 9918.45] + - [555, 9918.45] - - [4096, 1024, 1, 3263] - - [453, 9843.89] + - [548, 9843.89] - - [4096, 1024, 1, 3430] - - [460, 9885.26] + - [555, 9885.26] - - [4096, 1024, 1, 3389] - - [460, 9859.23] + - [555, 9859.23] - - [4096, 1024, 1, 3528] - - [460, 9873.01] + - [555, 9873.01] - - [1024, 4096, 1, 3463] - - [461, 9929.61] + - [556, 9929.61] - - [4096, 1024, 1, 3526] - - [461, 9876.9] + - [556, 9876.9] - - [4096, 1024, 1, 3154] - - [460, 9858.25] + - [555, 9858.25] - - [4096, 1024, 1, 3499] - - [461, 9862.92] + - [556, 9862.92] - - [1024, 1024, 1, 3939] - - [459, 9107.41] + - [554, 9107.41] - - [4096, 1024, 1, 3955] - - [461, 9906.28] + - [556, 9906.28] - - [1024, 4096, 1, 3297] - - [454, 9925.34] + - [549, 9925.34] - - [1024, 4096, 1, 3233] - - [460, 9920.65] + - [555, 9920.65] - - [1024, 4096, 1, 3226] - - [460, 9911.35] + - [555, 9911.35] - - [4096, 1024, 1, 3404] - - [460, 9867.28] + - [555, 9867.28] - - [4096, 1024, 1, 3355] - - [460, 9862.66] + - [555, 9862.66] - - [1024, 4096, 1, 3542] - - [460, 9926.49] + - [555, 9926.49] - - [4096, 1024, 1, 3181] - - [461, 9831.86] + - [556, 9831.86] - - [1024, 4096, 1, 3474] - - [460, 9928.03] + - [555, 9928.03] - - [4096, 1024, 1, 3319] - - [460, 9870.28] + - [555, 9870.28] - - [1024, 4096, 1, 3434] - - [453, 9917.51] + - [548, 9917.51] - - [1024, 4096, 1, 3860] - - [460, 9945.32] + - [555, 9945.32] - - [1024, 4096, 1, 3343] - - [453, 9914.66] + - [548, 9914.66] - - [64, 77, 816, 78] - - [431, 5276.97] + - [526, 5276.97] - - [1024, 4096, 1, 3488] - - [460, 9945.81] + - [555, 9945.81] - - [1024, 4096, 1, 3046] - - [460, 9908.78] + - [555, 9908.78] - - [1024, 4096, 1, 3141] - - [461, 9909.18] + - [556, 9909.18] - - [1024, 4096, 1, 3516] - - [461, 9911.38] + - [556, 9911.38] - - [4096, 1024, 1, 3147] - - [460, 9840.47] + - [555, 9840.47] - - [1024, 1024, 1, 4059] - - [452, 9009.78] + - [547, 9009.78] - - [1024, 1024, 1, 3944] - - [452, 9006.17] + - [547, 9006.17] - - [1024, 4096, 1, 3421] - - [461, 9919.86] + - [556, 9919.86] - - [4096, 1024, 1, 3944] - - [454, 9899.53] + - [549, 9899.53] - - [64, 45, 1424, 45] - - [439, 4068.67] + - [534, 4068.67] - - [1024, 4096, 1, 3574] - - [454, 9930.19] + - [549, 9930.19] - - [1024, 4096, 1, 3977] - - [453, 9944.28] + - [548, 9944.28] - - [1024, 1024, 1, 3968] - - [458, 9045.22] + - [553, 9045.22] - - [1024, 4096, 1, 2985] - - [460, 9887.65] + - [555, 9887.65] - - [64, 193, 320, 193] - - [447, 6631.35] + - [542, 6631.35] - - [1024, 4096, 1, 3427] - - [461, 9933.41] + - [556, 9933.41] - - [64, 12, 5040, 12] - - [441, 1552.53] + - [536, 1552.53] - - [1024, 4096, 1, 3482] - - [461, 9942.22] + - [556, 9942.22] - - [1024, 4096, 1, 3332] - - [453, 9923.58] + - [548, 9923.58] - - [1024, 1024, 1, 3720] - - [457, 9039.56] + - [552, 9039.56] - - [4096, 1024, 1, 3308] - - [461, 9852.66] + - [556, 9852.66] - - [1024, 4096, 1, 3513] - - [461, 9919.99] + - [556, 9919.99] - - [1024, 4096, 1, 3154] - - [454, 9908.46] + - [549, 9908.46] - - [1024, 4096, 1, 3955] - - [461, 9950.01] + - [556, 9950.01] - - [1024, 4096, 1, 2967] - - [461, 9897.44] + - [556, 9897.44] - - [1024, 33708, 1, 3942] - - [453, 10336.1] + - [548, 10336.1] - - [1024, 4096, 1, 3319] - - [461, 9912.45] + - [556, 9912.45] - - [4096, 1024, 1, 3860] - - [460, 9909.29] + - [555, 9909.29] - - [1024, 4096, 1, 3548] - - [453, 9924.21] + - [548, 9924.21] - - [4096, 1024, 1, 3977] - - [461, 9891.44] + - [556, 9891.44] - - [4096, 1024, 1, 3535] - - [460, 9867.84] + - [555, 9867.84] - - [1024, 4096, 1, 3541] - - [461, 9923.16] + - [556, 9923.16] - - [1024, 1024, 1, 3910] - - [458, 9080.4] + - [553, 9080.4] - - [1024, 33708, 1, 3584] - - [453, 10333.0] + - [548, 10333.0] - - [1024, 4096, 1, 3168] - - [454, 9926.27] + - [549, 9926.27] - - [1024, 4096, 1, 3448] - - [461, 9922.42] + - [556, 9922.42] - - [4096, 1024, 1, 3343] - - [460, 9857.23] + - [555, 9857.23] - - [64, 35, 1808, 35] - - [443, 3175.44] + - [538, 3175.44] - - [1024, 4096, 1, 3357] - - [454, 9902.41] + - [549, 9902.41] - - [64, 143, 432, 143] - - [444, 6489.7] + - [539, 6489.7] - - [4096, 1024, 1, 3510] - - [460, 9867.4] + - [555, 9867.4] - - [4096, 1024, 1, 3369] - - [460, 9863.44] + - [555, 9863.44] - - [64, 92, 688, 93] - - [431, 6188.3] + - [526, 6188.3] - - [4096, 1024, 1, 3379] - - [460, 9870.12] + - [555, 9870.12] - - [1024, 4096, 1, 3276] - - [460, 9904.77] + - [555, 9904.77] - - [1024, 4096, 1, 3363] - - [460, 9925.13] + - [555, 9925.13] - - [4096, 1024, 1, 3055] - - [460, 9831.92] + - [555, 9831.92] - - [1024, 4096, 1, 3524] - - [453, 9923.79] + - [548, 9923.79] - - [4096, 1024, 1, 3057] - - [460, 9852.87] + - [555, 9852.87] - - [1024, 33708, 1, 3720] - - [454, 10327.1] + - [549, 10327.1] - - [1024, 4096, 1, 3383] - - [453, 9919.39] + - [548, 9919.39] - - [1024, 4096, 1, 3522] - - [454, 9932.56] + - [549, 9932.56] - - [1024, 33708, 1, 3956] - - [453, 10333.8] + - [548, 10333.8] - - [1024, 4096, 1, 3481] - - [453, 9922.08] + - [548, 9922.08] - - [4096, 1024, 1, 3562] - - [461, 9874.86] + - [556, 9874.86] - - [4096, 1024, 1, 3299] - - [460, 9872.97] + - [555, 9872.97] - - [1024, 4096, 1, 3262] - - [454, 9924.83] + - [549, 9924.83] - - [1024, 4096, 1, 3840] - - [453, 9961.84] + - [548, 9961.84] - - [1024, 33708, 1, 4026] - - [453, 10334.3] + - [548, 10334.3] - - [4096, 1024, 1, 3168] - - [454, 9878.45] + - [549, 9878.45] - - [64, 101, 624, 101] - - [434, 5734.72] + - [529, 5734.72] - - [1024, 4096, 1, 3999] - - [453, 9947.1] + - [548, 9947.1] - - [1024, 4096, 1, 3549] - - [453, 9923.3] + - [548, 9923.3] - - [4096, 1024, 1, 3375] - - [460, 9868.89] + - [555, 9868.89] - - [1024, 4096, 1, 3496] - - [461, 9928.67] + - [556, 9928.67] - - [64, 29, 2176, 29] - - [430, 3290.02] + - [525, 3290.02] - - [1024, 4096, 1, 3190] - - [461, 9897.61] + - [556, 9897.61] - - [4096, 1024, 1, 3273] - - [461, 9853.65] + - [556, 9853.65] - - [1024, 4096, 1, 3406] - - [460, 9907.04] + - [555, 9907.04] - - [4096, 1024, 1, 4005] - - [453, 9907.97] + - [548, 9907.97] - - [4096, 1024, 1, 3555] - - [460, 9878.96] + - [555, 9878.96] - - [4096, 1024, 1, 2505] - - [460, 9785.1] + - [555, 9785.1] - - [1024, 4096, 1, 3460] - - [460, 9930.24] + - [555, 9930.24] - - [64, 17, 3632, 17] - - [431, 1917.27] + - [526, 1917.27] - - [1024, 4096, 1, 3579] - - [454, 9920.94] + - [549, 9920.94] - - [1024, 33708, 1, 4030] - - [454, 10327.7] + - [549, 10327.7] - - [1024, 4096, 1, 3510] - - [454, 9931.31] + - [549, 9931.31] - - [1024, 1024, 1, 3969] - - [451, 9020.83] + - [546, 9020.83] - - [1024, 4096, 1, 3282] - - [461, 9920.05] + - [556, 9920.05] - - [1024, 4096, 1, 3377] - - [453, 9927.34] + - [548, 9927.34] - - [1024, 4096, 1, 2935] - - [461, 9903.48] + - [556, 9903.48] - - [64, 41, 1552, 41] - - [431, 3740.48] + - [526, 3740.48] - - [1024, 4096, 1, 3498] - - [453, 9915.01] + - [548, 9915.01] - - [1024, 4096, 1, 3593] - - [460, 9925.64] + - [555, 9925.64] - - [1024, 1024, 1, 3948] - - [459, 9009.03] + - [554, 9009.03] - - [4096, 1024, 1, 3226] - - [461, 9854.75] + - [556, 9854.75] - - [1024, 4096, 1, 2499] - - [460, 9904.82] + - [555, 9904.82] - - [1024, 4096, 1, 3296] - - [453, 9926.89] + - [548, 9926.89] - - [1024, 4096, 1, 3455] - - [460, 9917.52] + - [555, 9917.52] - - [1024, 4096, 1, 3399] - - [454, 9919.7] + - [549, 9919.7] - - [1024, 4096, 1, 3205] - - [453, 9917.74] + - [548, 9917.74] - - [4096, 1024, 1, 4026] - - [461, 9897.81] + - [556, 9897.81] - - [1024, 4096, 1, 3484] - - [453, 9915.53] + - [548, 9915.53] - - [4096, 1024, 1, 3302] - - [461, 9862.8] + - [556, 9862.8] - - [1024, 4096, 1, 3485] - - [461, 9913.0] + - [556, 9913.0] - - [1024, 1024, 1, 3996] - - [459, 9008.77] + - [554, 9008.77] - - [1024, 4096, 1, 3126] - - [454, 9910.16] + - [549, 9910.16] - - [1024, 4096, 1, 4050] - - [453, 9951.21] + - [548, 9951.21] - - [4096, 1024, 1, 3235] - - [454, 9870.74] + - [549, 9870.74] - - [1024, 33708, 1, 3955] - - [453, 10336.1] + - [548, 10336.1] - - [1024, 4096, 1, 3342] - - [453, 9903.85] + - [548, 9903.85] - - [1024, 1024, 1, 3900] - - [458, 9082.92] + - [553, 9082.92] - - [1024, 4096, 1, 3397] - - [461, 9922.7] + - [556, 9922.7] - - [4096, 1024, 1, 3491] - - [461, 9880.75] + - [556, 9880.75] - - [1024, 4096, 1, 3503] - - [453, 9923.28] + - [548, 9923.28] - - [1024, 4096, 1, 3140] - - [454, 9908.41] + - [549, 9908.41] - - [4096, 1024, 1, 3121] - - [460, 9860.32] + - [555, 9860.32] - - [4096, 1024, 1, 3276] - - [460, 9854.19] + - [555, 9854.19] - - [1024, 4096, 1, 3321] - - [461, 9917.86] + - [556, 9917.86] - - [1024, 4096, 1, 3870] - - [461, 9931.07] + - [556, 9931.07] - - [4096, 1024, 1, 3475] - - [460, 9877.58] + - [555, 9877.58] - - [1024, 4096, 1, 2984] - - [460, 9895.59] + - [555, 9895.59] - - [4096, 1024, 1, 3363] - - [454, 9873.44] + - [549, 9873.44] - - [1024, 4096, 1, 3582] - - [460, 9920.87] + - [555, 9920.87] - - [4096, 1024, 1, 3509] - - [460, 9886.86] + - [555, 9886.86] - - [1024, 4096, 1, 3426] - - [453, 9928.86] + - [548, 9928.86] - - [4096, 1024, 1, 3136] - - [460, 9872.61] + - [555, 9872.61] - - [1024, 4096, 1, 3232] - - [461, 9926.29] + - [556, 9926.29] - - [4096, 1024, 1, 3103] - - [460, 9839.03] + - [555, 9839.03] - - [1024, 4096, 1, 3335] - - [454, 9913.37] + - [549, 9913.37] - - [1024, 4096, 1, 3900] - - [453, 9938.01] + - [548, 9938.01] - - [4096, 1024, 1, 3512] - - [454, 9877.26] + - [549, 9877.26] - - [4096, 1024, 1, 3222] - - [460, 9859.77] + - [555, 9859.77] - - [1024, 4096, 1, 3165] - - [460, 9899.71] + - [555, 9899.71] - - [4096, 1024, 1, 3408] - - [460, 9899.68] + - [555, 9899.68] - - [4096, 1024, 1, 3751] - - [460, 9891.49] + - [555, 9891.49] - - [1024, 4096, 1, 3318] - - [453, 9913.42] + - [548, 9913.42] - - [4096, 1024, 1, 3442] - - [461, 9880.21] + - [556, 9880.21] - - [1024, 4096, 1, 3413] - - [460, 9921.9] + - [555, 9921.9] - - [4096, 1024, 1, 3524] - - [460, 9879.22] + - [555, 9879.22] - - [1024, 4096, 1, 3976] - - [461, 9945.57] + - [556, 9945.57] - - [1024, 4096, 1, 3475] - - [461, 9932.51] + - [556, 9932.51] - - [1024, 4096, 1, 3534] - - [453, 9911.49] + - [548, 9911.49] - - [4096, 1024, 1, 3301] - - [460, 9872.75] + - [555, 9872.75] - - [4096, 1024, 1, 3248] - - [460, 9878.22] + - [555, 9878.22] - - [1024, 4096, 1, 2977] - - [454, 9899.93] + - [549, 9899.93] - - [4096, 1024, 1, 3346] - - [460, 9876.07] + - [555, 9876.07] - - [1024, 4096, 1, 3451] - - [453, 9920.16] + - [548, 9920.16] - - [1024, 4096, 1, 3257] - - [454, 9905.02] + - [549, 9905.02] - - [1024, 1024, 1, 3640] - - [452, 8983.39] + - [547, 8983.39] - - [1024, 4096, 1, 3356] - - [453, 9904.48] + - [548, 9904.48] - - [4096, 1024, 1, 3348] - - [461, 9872.53] + - [556, 9872.53] - - [4096, 1024, 1, 3335] - - [460, 9865.82] + - [555, 9865.82] - - [4096, 1024, 1, 3505] - - [460, 9888.88] + - [555, 9888.88] - - [1024, 4096, 1, 3490] - - [453, 9938.0] + - [548, 9938.0] - - [4096, 1024, 1, 3447] - - [460, 9865.39] + - [555, 9865.39] - - [1024, 4096, 1, 3267] - - [461, 9919.32] + - [556, 9919.32] - - [4096, 1024, 1, 3230] - - [460, 9853.2] + - [555, 9853.2] - - [4096, 1024, 1, 3455] - - [460, 9862.44] + - [555, 9862.44] - - [1024, 4096, 1, 3925] - - [453, 9945.64] + - [548, 9945.64] - - [1024, 4096, 1, 3362] - - [454, 9921.63] + - [549, 9921.63] - - [4096, 1024, 1, 3969] - - [461, 9911.98] + - [556, 9911.98] - - [4096, 1024, 1, 3527] - - [460, 9882.87] + - [555, 9882.87] - - [1024, 4096, 1, 3585] - - [454, 9946.52] + - [549, 9946.52] - - [4096, 1024, 1, 3063] - - [460, 9854.03] + - [555, 9854.03] - - [4096, 1024, 1, 3435] - - [460, 9867.13] + - [555, 9867.13] - - [4096, 1024, 1, 3366] - - [461, 9864.02] + - [556, 9864.02] - - [4096, 1024, 1, 3581] - - [453, 9868.57] + - [548, 9868.57] - - [1024, 33708, 1, 3906] - - [453, 10339.3] + - [548, 10339.3] - - [1024, 4096, 1, 3464] - - [461, 9916.21] + - [556, 9916.21] - - [1024, 4096, 1, 3440] - - [460, 9945.25] + - [555, 9945.25] - - [4096, 1024, 1, 3143] - - [460, 9846.76] + - [555, 9846.76] - - [1024, 4096, 1, 3349] - - [454, 9912.83] + - [549, 9912.83] - - [4096, 1024, 1, 3416] - - [460, 9885.13] + - [555, 9885.13] - - [4096, 1024, 1, 3365] - - [460, 9876.0] + - [555, 9876.0] - - [1024, 4096, 1, 3470] - - [461, 9914.98] + - [556, 9914.98] - - [4096, 1024, 1, 3287] - - [460, 9860.69] + - [555, 9860.69] - - [1024, 4096, 1, 3441] - - [461, 9928.98] + - [556, 9928.98] - - [4096, 1024, 1, 3224] - - [460, 9857.83] + - [555, 9857.83] - - [1024, 4096, 1, 3387] - - [453, 9911.72] + - [548, 9911.72] - - [1024, 4096, 1, 3547] - - [453, 9920.36] + - [548, 9920.36] - - [4096, 1024, 1, 3478] - - [454, 9882.9] + - [549, 9882.9] - - [4096, 1024, 1, 3548] - - [461, 9869.45] + - [556, 9869.45] - - [1024, 33708, 1, 4020] - - [453, 10345.3] + - [548, 10345.3] - - [4096, 1024, 1, 3320] - - [460, 9863.74] + - [555, 9863.74] - - [1024, 4096, 1, 3906] - - [460, 9942.67] + - [555, 9942.67] - - [4096, 1024, 1, 3796] - - [460, 9899.13] + - [555, 9899.13] - - [1024, 4096, 1, 3306] - - [453, 9902.4] + - [548, 9902.4] - - [1024, 4096, 1, 3401] - - [461, 9913.95] + - [556, 9913.95] - - [64, 147, 432, 147] - - [444, 6626.6] + - [539, 6626.6] - - [1024, 4096, 1, 3215] - - [461, 9911.24] + - [556, 9911.24] - - [4096, 1024, 1, 4012] - - [461, 9898.2] + - [556, 9898.2] - - [1024, 4096, 1, 2765] - - [461, 9863.73] + - [556, 9863.73] - - [4096, 1024, 1, 3554] - - [454, 9883.52] + - [549, 9883.52] - - [4096, 1024, 1, 3423] - - [460, 9866.72] + - [555, 9866.72] - - [1024, 1024, 1, 3751] - - [458, 9006.36] + - [553, 9006.36] - - [1024, 4096, 1, 3562] - - [454, 9922.08] + - [549, 9922.08] - - [1024, 4096, 1, 3489] - - [453, 9936.78] + - [548, 9936.78] - - [4096, 1024, 1, 3358] - - [460, 9858.22] + - [555, 9858.22] - - [4096, 1024, 1, 3270] - - [461, 9850.84] + - [556, 9850.84] - - [1024, 4096, 1, 3293] - - [453, 9905.33] + - [548, 9905.33] - - [1024, 4096, 1, 3376] - - [453, 9934.98] + - [548, 9934.98] - - [4096, 1024, 1, 3245] - - [460, 9852.52] + - [555, 9852.52] - - [4096, 1024, 1, 3541] - - [460, 9887.22] + - [555, 9887.22] - - [4096, 1024, 1, 3443] - - [460, 9871.73] + - [555, 9871.73] - - [4096, 1024, 1, 3438] - - [461, 9863.86] + - [556, 9863.86] - - [4096, 1024, 1, 3244] - - [460, 9859.76] + - [555, 9859.76] - - [1024, 4096, 1, 3365] - - [460, 9922.1] + - [555, 9922.1] - - [1024, 4096, 1, 3299] - - [454, 9923.38] + - [549, 9923.38] - - [4096, 1024, 1, 3840] - - [460, 9914.75] + - [555, 9914.75] - - [1024, 4096, 1, 3471] - - [461, 9918.38] + - [556, 9918.38] - - [1024, 4096, 1, 3398] - - [453, 9918.99] + - [548, 9918.99] - - [4096, 1024, 1, 3162] - - [460, 9843.93] + - [555, 9843.93] - - [1024, 4096, 1, 4005] - - [454, 9947.87] + - [549, 9947.87] - - [4096, 1024, 1, 3579] - - [460, 9868.25] + - [555, 9868.25] - - [64, 18, 3440, 18] - - [436, 2059.33] + - [531, 2059.33] - - [64, 177, 352, 177] - - [455, 7315.4] + - [550, 7315.4] - - [1024, 4096, 1, 3121] - - [461, 9930.34] + - [556, 9930.34] - - [4096, 1024, 1, 3441] - - [460, 9883.28] + - [555, 9883.28] - - [4096, 1024, 1, 3422] - - [460, 9858.41] + - [555, 9858.41] - - [4096, 1024, 1, 3444] - - [460, 9887.03] + - [555, 9887.03] - - [1024, 4096, 1, 3337] - - [454, 9911.45] + - [549, 9911.45] - - [4096, 1024, 1, 3550] - - [453, 9871.87] + - [548, 9871.87] - - [1024, 4096, 1, 3477] - - [453, 9930.65] + - [548, 9930.65] - - [4096, 1024, 1, 3490] - - [460, 9878.45] + - [555, 9878.45] - - [4096, 1024, 1, 3585] - - [460, 9893.63] + - [555, 9893.63] - - [1024, 4096, 1, 3143] - - [453, 9901.19] + - [548, 9901.19] - - [1024, 33708, 1, 3876] - - [454, 10330.8] + - [549, 10330.8] - - [1024, 4096, 1, 3320] - - [461, 9913.18] + - [556, 9913.18] - - [1024, 4096, 1, 3423] - - [461, 9914.14] + - [556, 9914.14] - - [1024, 4096, 1, 3894] - - [453, 9944.47] + - [548, 9944.47] - - [4096, 1024, 1, 3410] - - [460, 9878.67] + - [555, 9878.67] - - [1024, 4096, 1, 3561] - - [453, 9926.68] + - [548, 9926.68] - - [4096, 1024, 1, 3492] - - [454, 9872.92] + - [549, 9872.92] - - [64, 85, 752, 85] - - [431, 5734.35] + - [526, 5734.35] - - [36548, 1024, 1, 3712] - - [463, 10367.6] + - [558, 10367.6] - - [4096, 2048, 1, 128] - - [464, 8743.93] + - [559, 8743.93] - - [1024, 1024, 1, 3712] - - [465, 9976.29] + - [560, 9976.29] - - [1024, 1024, 1, 128] - - [462, 5765.47] + - [557, 5765.47] - - [4096, 3072, 1, 128] - - [464, 8869.11] + - [559, 8869.11] - - [768, 3072, 1, 4096] - - [476, 10028.8] + - [571, 10028.8] - - [64, 256, 192, 256] - - [470, 8791.65] + - [565, 8791.65] - - [768, 2, 1, 16] - - [473, 5.05484] + - [568, 5.05484] - - [768, 768, 1, 64] - - [469, 3469.65] + - [564, 3469.65] - - [768, 768, 1, 4096] - - [477, 7475.1] + - [572, 7475.1] - - [768, 30522, 1, 1280] - - [480, 10297.0] + - [575, 10297.0] - - [64, 128, 384, 128] - - [470, 7660.93] + - [565, 7660.93] - - [768, 30522, 1, 320] - - [478, 10008.0] + - [573, 10008.0] - - [768, 768, 1, 32] - - [467, 2359.4] + - [562, 2359.4] - - [3072, 768, 1, 4096] - - [476, 10033.8] + - [571, 10033.8] - - [768, 30522, 1, 640] - - [479, 10206.8] + - [574, 10206.8] - - [64, 64, 768, 64] - - [468, 5494.82] + - [563, 5494.82] - - [768, 768, 1, 640] - - [477, 6721.74] + - [572, 6721.74] - - [768, 768, 1, 16] - - [466, 1203.82] + - [561, 1203.82] - - [768, 768, 1, 1280] - - [475, 7138.67] + - [570, 7138.67] - - [768, 2, 1, 32] - - [471, 11.9154] + - [566, 11.9154] - - [2048, 2048, 1, 512] - - [491, 9607.67] + - [586, 9607.67] - - [512, 32, 1, 200] - - [484, 422.368] + - [579, 422.368] - - [1024, 1, 1, 200] - - [487, 24.7154] + - [582, 24.7154] - - [1600, 1024, 1, 512] - - [482, 8116.01] + - [577, 8116.01] - - [560, 1024, 1, 200] - - [481, 4810.84] + - [576, 4810.84] - - [1024, 1024, 1, 512] - - [490, 8614.84] + - [585, 8614.84] - - [2048, 1, 1, 512] - - [485, 81.0086] + - [580, 81.0086] - - [512, 512, 1, 200] - - [483, 4398.49] + - [578, 4398.49] - - [100, 2048, 1, 512] - - [488, 4443.22] + - [583, 4443.22] - - [1024, 1024, 1, 200] - - [489, 6990.61] + - [584, 6990.61] - - [1024, 64, 1, 512] - - [486, 2853.37] + - [581, 2853.37] - - [1024, 256, 1, 18944] - - [510, 9196.51] + - [605, 9196.51] - - [256, 3328, 1, 8976] - - [500, 8299.36] + - [595, 8299.36] - - [1024, 256, 1, 4352] - - [508, 8813.84] + - [603, 8813.84] - - [256, 9728, 1, 8976] - - [503, 9638.58] + - [598, 9638.58] - - [1024, 256, 1, 3072] - - [510, 8640.73] + - [605, 8640.73] - - [768, 2048, 1, 256] - - [502, 8663.03] + - [597, 8663.03] - - [1024, 256, 1, 19968] - - [507, 9220.96] + - [602, 9220.96] - - [256, 12800, 1, 8976] - - [497, 9418.52] + - [592, 9418.52] - - [1024, 256, 1, 3328] - - [511, 8682.58] + - [606, 8682.58] - - [256, 10240, 1, 8976] - - [504, 10137.8] + - [599, 10137.8] - - [1024, 256, 1, 15104] - - [509, 9167.13] + - [604, 9167.13] - - [256, 10496, 1, 8976] - - [497, 9858.48] + - [592, 9858.48] - - [1024, 256, 1, 2816] - - [512, 8575.81] + - [607, 8575.81] - - [1024, 256, 1, 4608] - - [507, 8861.31] + - [602, 8861.31] - - [256, 11264, 1, 8976] - - [494, 9627.79] + - [589, 9627.79] - - [1024, 256, 1, 6400] - - [507, 8985.33] + - [602, 8985.33] - - [1024, 256, 1, 16128] - - [507, 9170.36] + - [602, 9170.36] - - [256, 44505, 1, 8976] - - [501, 10331.9] + - [596, 10331.9] - - [256, 6144, 1, 8976] - - [504, 10395.1] + - [599, 10395.1] - - [1024, 256, 1, 5120] - - [509, 8881.63] + - [604, 8881.63] - - [1024, 256, 1, 7936] - - [512, 9023.24] + - [607, 9023.24] - - [256, 3840, 1, 8976] - - [499, 9541.38] + - [594, 9541.38] - - [1024, 256, 1, 21248] - - [507, 9209.82] + - [602, 9209.82] - - [1024, 256, 1, 12032] - - [509, 9156.27] + - [604, 9156.27] - - [256, 8192, 1, 8976] - - [506, 10374.5] + - [601, 10374.5] - - [1024, 256, 1, 3584] - - [508, 8712.3] + - [603, 8712.3] - - [1024, 256, 1, 14336] - - [509, 9162.61] + - [604, 9162.61] - - [256, 7168, 1, 8976] - - [495, 9554.96] + - [590, 9554.96] - - [1024, 256, 1, 13568] - - [507, 9165.14] + - [602, 9165.14] - - [256, 4096, 1, 8976] - - [499, 10146.7] + - [594, 10146.7] - - [1024, 256, 1, 4096] - - [508, 8783.98] + - [603, 8783.98] - - [256, 2560, 1, 8976] - - [498, 8381.66] + - [593, 8381.66] - - [256, 20992, 1, 8976] - - [497, 9989.96] + - [592, 9989.96] - - [256, 4352, 1, 8976] - - [498, 9635.02] + - [593, 9635.02] - - [256, 33536, 1, 8976] - - [497, 10218.2] + - [592, 10218.2] - - [256, 3584, 1, 8976] - - [499, 8924.6] + - [594, 8924.6] - - [256, 26112, 1, 8976] - - [498, 10272.4] + - [593, 10272.4] - - [256, 14336, 1, 8976] - - [502, 10217.4] + - [597, 10217.4] - - [1024, 256, 1, 14848] - - [509, 9185.29] + - [604, 9185.29] - - [1024, 256, 1, 8448] - - [510, 9025.99] + - [605, 9025.99] - - [1024, 256, 1, 28672] - - [507, 9256.5] + - [602, 9256.5] - - [1024, 256, 1, 5632] - - [507, 8932.79] + - [602, 8932.79] - - [256, 22016, 1, 8976] - - [502, 10152.0] + - [597, 10152.0] - - [1024, 256, 1, 33536] - - [507, 9243.17] + - [602, 9243.17] - - [256, 5120, 1, 8976] - - [493, 9418.15] + - [588, 9418.15] - - [256, 11520, 1, 8976] - - [500, 9701.1] + - [595, 9701.1] - - [256, 19968, 1, 8976] - - [498, 10228.1] + - [593, 10228.1] - - [1024, 256, 1, 5376] - - [509, 8892.62] + - [604, 8892.62] - - [1024, 256, 1, 22016] - - [507, 9244.34] + - [602, 9244.34] - - [256, 8960, 1, 8976] - - [498, 9841.41] + - [593, 9841.41] - - [1024, 256, 1, 15872] - - [507, 9223.25] + - [602, 9223.25] - - [256, 17408, 1, 8976] - - [502, 9785.87] + - [597, 9785.87] - - [256, 5632, 1, 8976] - - [502, 9564.32] + - [597, 9564.32] - - [256, 32512, 1, 8976] - - [501, 10358.0] + - [596, 10358.0] - - [256, 11008, 1, 8976] - - [494, 9445.23] + - [589, 9445.23] - - [1024, 256, 1, 6144] - - [509, 8955.91] + - [604, 8955.91] - - [256, 4864, 1, 8976] - - [494, 8979.45] + - [589, 8979.45] - - [256, 15104, 1, 8976] - - [497, 10007.1] + - [592, 10007.1] - - [1024, 256, 1, 9984] - - [507, 9110.53] + - [602, 9110.53] - - [256, 1280, 1, 8976] - - [493, 5944.44] + - [588, 5944.44] - - [1024, 256, 1, 1024] - - [509, 7005.2] + - [604, 7005.2] - - [1024, 256, 1, 9728] - - [509, 9066.29] + - [604, 9066.29] - - [1024, 256, 1, 10496] - - [507, 9118.15] + - [602, 9118.15] - - [256, 11776, 1, 8976] - - [504, 9911.74] + - [599, 9911.74] - - [256, 12544, 1, 8976] - - [497, 9235.35] + - [592, 9235.35] - - [1024, 256, 1, 17152] - - [507, 9152.31] + - [602, 9152.31] - - [1024, 256, 1, 11520] - - [509, 9146.87] + - [604, 9146.87] - - [1024, 256, 1, 21504] - - [509, 9207.52] + - [604, 9207.52] - - [256, 17152, 1, 8976] - - [496, 9654.81] + - [591, 9654.81] - - [1024, 256, 1, 17408] - - [507, 9181.27] + - [602, 9181.27] - - [256, 15872, 1, 8976] - - [505, 10086.5] + - [600, 10086.5] - - [256, 18688, 1, 8976] - - [498, 9612.57] + - [593, 9612.57] - - [256, 5888, 1, 8976] - - [502, 9988.43] + - [597, 9988.43] - - [512, 2048, 1, 256] - - [492, 7678.46] + - [587, 7678.46] - - [1024, 256, 1, 7680] - - [510, 9033.06] + - [605, 9033.06] - - [1024, 256, 1, 1280] - - [512, 7767.33] + - [607, 7767.33] - - [256, 14848, 1, 8976] - - [498, 9852.76] + - [593, 9852.76] - - [256, 9984, 1, 8976] - - [504, 9908.97] + - [599, 9908.97] - - [256, 20480, 1, 8976] - - [502, 10337.2] + - [597, 10337.2] - - [1024, 256, 1, 8192] - - [509, 9044.42] + - [604, 9044.42] - - [1024, 256, 1, 19712] - - [508, 9184.28] + - [603, 9184.28] - - [256, 13568, 1, 8976] - - [498, 9927.92] + - [593, 9927.92] - - [256, 13312, 1, 8976] - - [497, 9758.01] + - [592, 9758.01] - - [256, 2816, 1, 8976] - - [497, 9191.53] + - [592, 9191.53] - - [1024, 256, 1, 2304] - - [508, 8445.01] + - [603, 8445.01] - - [256, 21248, 1, 8976] - - [498, 10127.6] + - [593, 10127.6] - - [256, 16128, 1, 8976] - - [506, 10238.5] + - [601, 10238.5] - - [256, 512, 36, 98] - - [529, 7994.95] + - [624, 7994.95] - - [64, 192, 36, 25088] - - [598, 8613.99] + - [693, 8613.99] - - [128, 128, 64, 25] - - [528, 2540.25] + - [623, 2540.25] - - [256, 256, 64, 56] - - [529, 6924.66] + - [624, 6924.66] - - [512, 486, 36, 800] - - [536, 8994.94] + - [631, 8994.94] - - [512, 512, 36, 1568] - - [547, 9872.48] + - [642, 9872.48] - - [64, 192, 64, 3200] - - [592, 9295.99] + - [687, 9295.99] - - [256, 384, 36, 4096] - - [592, 9334.71] + - [687, 9334.71] - - [128, 256, 64, 32] - - [531, 4280.0] + - [626, 4280.0] - - [64, 128, 64, 23104] - - [598, 10103.2] + - [693, 10103.2] - - [128, 256, 64, 9] - - [522, 1709.73] + - [617, 1709.73] - - [256, 512, 36, 784] - - [532, 9520.83] + - [627, 9520.83] - - [256, 324, 36, 32] - - [570, 4473.48] + - [665, 4473.48] - - [512, 512, 36, 33] - - [541, 5925.27] + - [636, 5925.27] - - [16, 32, 36, 5760] - - [545, 1448.9] + - [640, 1448.9] - - [192, 384, 64, 128] - - [592, 8618.53] + - [687, 8618.53] - - [512, 512, 64, 72] - - [548, 8260.22] + - [643, 8260.22] - - [128, 128, 64, 1600] - - [521, 9008.48] + - [616, 9008.48] - - [512, 512, 36, 128] - - [592, 8871.72] + - [687, 8871.72] - - [192, 384, 64, 2304] - - [521, 9657.26] + - [616, 9657.26] - - [384, 256, 64, 450] - - [557, 9539.03] + - [652, 9539.03] - - [3, 64, 36, 6272] - - [545, 509.884] + - [640, 509.884] - - [3, 64, 64, 2888] - - [574, 708.721] + - [669, 708.721] - - [384, 256, 64, 2304] - - [557, 10287.6] + - [652, 10287.6] - - [512, 512, 64, 144] - - [592, 9226.8] + - [687, 9226.8] - - [256, 256, 36, 6272] - - [532, 9607.38] + - [627, 9607.38] - - [80, 192, 64, 4608] - - [593, 7348.03] + - [688, 7348.03] - - [64, 64, 36, 3136] - - [580, 5959.15] + - [675, 5959.15] - - [256, 384, 64, 2304] - - [557, 10283.5] + - [652, 10283.5] - - [512, 512, 36, 66] - - [541, 7618.18] + - [636, 7618.18] - - [128, 256, 64, 800] - - [567, 9611.25] + - [662, 9611.25] - - [64, 128, 36, 30] - - [523, 1242.71] + - [618, 1242.71] - - [192, 256, 36, 512] - - [592, 8658.07] + - [687, 8658.07] - - [256, 512, 64, 200] - - [592, 9153.97] + - [687, 9153.97] - - [256, 512, 64, 25] - - [570, 5349.98] + - [665, 5349.98] - - [3, 64, 64, 46208] - - [573, 808.662] + - [668, 808.662] - - [128, 256, 36, 1568] - - [565, 8528.72] + - [660, 8528.72] - - [64, 128, 64, 11552] - - [598, 9997.1] + - [693, 9997.1] - - [128, 192, 64, 946] - - [592, 9198.48] + - [687, 9198.48] - - [64, 192, 64, 12800] - - [553, 9000.76] + - [648, 9000.76] - - [224, 224, 64, 128] - - [530, 6312.17] + - [625, 6312.17] - - [128, 256, 64, 288] - - [592, 8697.97] + - [687, 8697.97] - - [64, 64, 64, 826] - - [535, 6650.31] + - [630, 6650.31] - - [256, 384, 64, 1152] - - [567, 10106.9] + - [662, 10106.9] - - [3, 64, 64, 92416] - - [573, 812.131] + - [668, 812.131] - - [32, 32, 36, 43808] - - [514, 2813.19] + - [609, 2813.19] - - [160, 320, 64, 288] - - [524, 8090.96] + - [619, 8090.96] - - [1, 16, 36, 23040] - - [561, 42.7667] + - [656, 42.7667] - - [128, 256, 36, 128] - - [539, 6049.58] + - [634, 6049.58] - - [128, 128, 64, 3360] - - [592, 9200.06] + - [687, 9200.06] - - [128, 128, 64, 420] - - [592, 8131.6] + - [687, 8131.6] - - [64, 128, 64, 361] - - [529, 6938.08] + - [624, 6938.08] - - [512, 512, 36, 16] - - [585, 3797.76] + - [680, 3797.76] - - [384, 256, 36, 800] - - [526, 9151.75] + - [621, 9151.75] - - [192, 384, 36, 4096] - - [526, 8867.67] + - [621, 8867.67] - - [64, 64, 64, 1600] - - [578, 7931.84] + - [673, 7931.84] - - [256, 384, 64, 576] - - [558, 9745.9] + - [653, 9745.9] - - [512, 512, 64, 14] - - [541, 3638.28] + - [636, 3638.28] - - [512, 512, 36, 8] - - [516, 2279.61] + - [611, 2279.61] - - [512, 486, 64, 128] - - [532, 8337.93] + - [627, 8337.93] - - [1, 16, 64, 640] - - [566, 50.0512] + - [661, 50.0512] - - [64, 96, 64, 288] - - [591, 5708.07] + - [686, 5708.07] - - [96, 96, 36, 1568] - - [560, 6866.85] + - [655, 6866.85] - - [256, 256, 36, 128] - - [564, 7703.92] + - [659, 7703.92] - - [64, 128, 36, 53824] - - [552, 6331.41] + - [647, 6331.41] - - [256, 256, 36, 32] - - [548, 4648.96] + - [643, 4648.96] - - [192, 256, 64, 288] - - [592, 8987.89] + - [687, 8987.89] - - [256, 256, 36, 16] - - [562, 2912.81] + - [657, 2912.81] - - [128, 256, 36, 3200] - - [565, 8680.37] + - [660, 8680.37] - - [160, 320, 64, 512] - - [524, 8449.54] + - [619, 8449.54] - - [128, 160, 36, 512] - - [535, 7215.07] + - [630, 7215.07] - - [96, 96, 36, 2592] - - [530, 7104.89] + - [625, 7104.89] - - [64, 96, 64, 800] - - [560, 7268.42] + - [655, 7268.42] - - [147, 64, 36, 18816] - - [576, 7116.36] + - [671, 7116.36] - - [160, 320, 36, 512] - - [530, 7874.92] + - [625, 7874.92] - - [256, 512, 36, 4] - - [569, 1034.88] + - [664, 1034.88] - - [96, 128, 64, 946] - - [552, 7901.17] + - [647, 7901.17] - - [256, 324, 64, 1568] - - [557, 8589.63] + - [652, 8589.63] - - [128, 128, 64, 50] - - [548, 4070.66] + - [643, 4070.66] - - [35, 96, 36, 8960] - - [542, 4207.4] + - [637, 4207.4] - - [32, 64, 36, 43808] - - [583, 4390.91] + - [678, 4390.91] - - [160, 224, 36, 128] - - [530, 5447.02] + - [625, 5447.02] - - [64, 64, 64, 81] - - [555, 2391.28] + - [650, 2391.28] - - [256, 256, 36, 3200] - - [521, 9559.65] + - [616, 9559.65] - - [256, 256, 36, 210] - - [532, 8414.71] + - [627, 8414.71] - - [192, 384, 64, 576] - - [592, 9468.85] + - [687, 9468.85] - - [512, 512, 64, 800] - - [567, 10096.5] + - [662, 10096.5] - - [512, 24, 36, 800] - - [518, 4761.87] + - [613, 4761.87] - - [64, 64, 64, 13216] - - [579, 8491.51] + - [674, 8491.51] - - [192, 224, 64, 1152] - - [535, 8769.16] + - [630, 8769.16] - - [256, 256, 64, 1152] - - [557, 9988.19] + - [652, 9988.19] - - [512, 486, 64, 512] - - [567, 9254.77] + - [662, 9254.77] - - [128, 128, 36, 784] - - [530, 7468.16] + - [625, 7468.16] - - [256, 512, 64, 1600] - - [554, 10232.6] + - [649, 10232.6] - - [512, 512, 64, 9] - - [548, 2599.88] + - [643, 2599.88] - - [96, 128, 64, 288] - - [560, 6599.53] + - [655, 6599.53] - - [64, 96, 36, 512] - - [560, 5073.85] + - [655, 5073.85] - - [256, 512, 36, 1568] - - [592, 9637.91] + - [687, 9637.91] - - [128, 128, 64, 400] - - [592, 8192.1] + - [687, 8192.1] - - [128, 128, 64, 800] - - [592, 8716.44] + - [687, 8716.44] - - [96, 128, 36, 512] - - [580, 6757.03] + - [675, 6757.03] - - [16, 32, 36, 360] - - [543, 754.136] + - [638, 754.136] - - [128, 256, 64, 3200] - - [557, 10222.6] + - [652, 10222.6] - - [96, 128, 64, 800] - - [560, 7968.0] + - [655, 7968.0] - - [256, 512, 64, 4] - - [522, 1098.09] + - [617, 1098.09] - - [256, 256, 64, 450] - - [567, 9347.55] + - [662, 9347.55] - - [64, 64, 64, 3200] - - [578, 8518.18] + - [673, 8518.18] - - [192, 224, 64, 128] - - [538, 7035.27] + - [633, 7035.27] - - [128, 128, 64, 288] - - [592, 7751.38] + - [687, 7751.38] - - [256, 256, 64, 72] - - [548, 7489.93] + - [643, 7489.93] - - [96, 208, 36, 512] - - [560, 6939.21] + - [655, 6939.21] - - [128, 256, 36, 3136] - - [535, 8669.43] + - [630, 8669.43] - - [64, 64, 36, 3520] - - [530, 6007.57] + - [625, 6007.57] - - [64, 128, 36, 1568] - - [593, 6897.8] + - [688, 6897.8] - - [160, 320, 64, 242] - - [519, 7873.27] + - [614, 7873.27] - - [192, 192, 36, 512] - - [530, 7707.42] + - [625, 7707.42] - - [512, 512, 36, 512] - - [592, 9582.52] + - [687, 9582.52] - - [1, 16, 64, 10240] - - [544, 71.4511] + - [639, 71.4511] - - [128, 128, 36, 512] - - [530, 7149.48] + - [625, 7149.48] - - [512, 512, 36, 256] - - [521, 9384.5] + - [616, 9384.5] - - [512, 512, 36, 1024] - - [515, 9777.99] + - [610, 9777.99] - - [96, 208, 64, 1152] - - [593, 7851.0] + - [688, 7851.0] - - [128, 192, 64, 3200] - - [521, 9490.92] + - [616, 9490.92] - - [256, 256, 36, 4096] - - [526, 9585.56] + - [621, 9585.56] - - [160, 160, 64, 288] - - [560, 7299.9] + - [655, 7299.9] - - [256, 256, 64, 896] - - [557, 9850.43] + - [652, 9850.43] - - [128, 256, 64, 242] - - [592, 8391.48] + - [687, 8391.48] - - [128, 128, 36, 440] - - [535, 6274.82] + - [630, 6274.82] - - [96, 128, 36, 1568] - - [580, 7875.13] + - [675, 7875.13] - - [192, 384, 36, 1024] - - [526, 8715.82] + - [621, 8715.82] - - [64, 96, 36, 10368] - - [597, 7478.69] + - [692, 7478.69] - - [128, 256, 64, 100] - - [541, 7085.07] + - [636, 7085.07] - - [112, 224, 36, 2048] - - [534, 7556.02] + - [629, 7556.02] - - [384, 256, 64, 1152] - - [557, 10102.4] + - [652, 10102.4] - - [192, 384, 36, 128] - - [592, 7543.14] + - [687, 7543.14] - - [128, 128, 36, 7040] - - [565, 7600.7] + - [660, 7600.7] - - [128, 256, 64, 1568] - - [557, 10006.0] + - [652, 10006.0] - - [128, 128, 36, 1568] - - [549, 7848.4] + - [644, 7848.4] - - [128, 256, 64, 72] - - [572, 6553.7] + - [667, 6553.7] - - [256, 256, 36, 12544] - - [586, 9365.14] + - [681, 9365.14] - - [256, 256, 36, 105] - - [548, 7286.16] + - [643, 7286.16] - - [128, 256, 36, 392] - - [535, 7625.79] + - [630, 7625.79] - - [64, 64, 64, 5408] - - [578, 8882.77] + - [673, 8882.77] - - [3, 64, 36, 25088] - - [545, 529.042] + - [640, 529.042] - - [384, 256, 36, 1024] - - [592, 9182.85] + - [687, 9182.85] - - [35, 96, 36, 13440] - - [599, 4110.39] + - [694, 4110.39] - - [128, 256, 64, 1152] - - [557, 9804.97] + - [652, 9804.97] - - [256, 324, 64, 32] - - [570, 5043.73] + - [665, 5043.73] - - [160, 224, 64, 128] - - [584, 6046.25] + - [679, 6046.25] - - [192, 224, 36, 2592] - - [582, 8878.78] + - [677, 8878.78] - - [96, 96, 64, 1152] - - [560, 8035.55] + - [655, 8035.55] - - [32, 64, 36, 90] - - [517, 964.565] + - [612, 964.565] - - [64, 128, 64, 2888] - - [532, 9047.33] + - [627, 9047.33] - - [256, 384, 36, 800] - - [592, 9154.12] + - [687, 9154.12] - - [512, 512, 64, 4] - - [589, 1233.72] + - [684, 1233.72] - - [192, 320, 36, 128] - - [529, 7388.29] + - [624, 7388.29] - - [64, 128, 36, 480] - - [593, 5653.37] + - [688, 5653.37] - - [192, 384, 64, 242] - - [592, 9080.09] + - [687, 9080.09] - - [256, 486, 64, 32] - - [585, 5909.28] + - [680, 5909.28] - - [147, 64, 64, 9702] - - [594, 7319.79] + - [689, 7319.79] - - [512, 512, 64, 64] - - [528, 8179.12] + - [623, 8179.12] - - [64, 192, 64, 3698] - - [521, 9287.99] + - [616, 9287.99] - - [73, 192, 64, 10439] - - [552, 6668.12] + - [647, 6668.12] - - [1, 16, 36, 1440] - - [568, 33.5452] + - [663, 33.5452] - - [128, 256, 36, 512] - - [535, 7989.25] + - [630, 7989.25] - - [512, 512, 64, 576] - - [567, 9951.99] + - [662, 9951.99] - - [64, 64, 36, 12544] - - [583, 5872.87] + - [678, 5872.87] - - [128, 128, 36, 880] - - [580, 7597.36] + - [675, 7597.36] - - [192, 224, 36, 128] - - [538, 6451.3] + - [633, 6451.3] - - [64, 64, 64, 800] - - [578, 6916.83] + - [673, 6916.83] - - [64, 128, 36, 12544] - - [556, 6395.98] + - [651, 6395.98] - - [64, 64, 36, 1568] - - [530, 5536.76] + - [625, 5536.76] - - [160, 160, 36, 512] - - [530, 7345.36] + - [625, 7345.36] - - [512, 24, 64, 512] - - [520, 5242.98] + - [615, 5242.98] - - [3, 64, 36, 3136] - - [545, 475.452] + - [640, 475.452] - - [256, 256, 64, 9] - - [570, 2106.61] + - [665, 2106.61] - - [3, 64, 64, 11552] - - [573, 785.227] + - [668, 785.227] - - [128, 256, 36, 12544] - - [588, 8792.23] + - [683, 8792.23] - - [128, 128, 36, 3136] - - [549, 8098.56] + - [644, 8098.56] - - [256, 512, 36, 3136] - - [532, 9694.49] + - [627, 9694.49] - - [64, 64, 36, 196] - - [546, 2757.86] + - [641, 2757.86] - - [144, 288, 36, 512] - - [580, 7077.99] + - [675, 7077.99] - - [256, 24, 64, 32] - - [559, 1483.93] + - [654, 1483.93] - - [384, 384, 36, 800] - - [521, 9246.6] + - [616, 9246.6] - - [512, 512, 64, 1600] - - [567, 10277.4] + - [662, 10277.4] - - [112, 224, 36, 512] - - [535, 6744.88] + - [630, 6744.88] - - [128, 128, 36, 49] - - [541, 2716.39] + - [636, 2716.39] - - [512, 512, 36, 4] - - [569, 1156.62] + - [664, 1156.62] - - [35, 96, 64, 4235] - - [530, 4631.38] + - [625, 4631.38] - - [192, 384, 64, 450] - - [521, 9372.3] + - [616, 9372.3] - - [256, 256, 36, 1024] - - [592, 9346.74] + - [687, 9346.74] - - [112, 224, 64, 1152] - - [535, 7524.05] + - [630, 7524.05] - - [256, 512, 64, 400] - - [554, 9598.05] + - [649, 9598.05] - - [149, 32, 36, 19072] - - [599, 5811.9] + - [694, 5811.9] - - [128, 256, 36, 6272] - - [535, 8754.78] + - [630, 8754.78] - - [128, 192, 36, 1568] - - [560, 8195.2] + - [655, 8195.2] - - [256, 256, 36, 512] - - [592, 9074.32] + - [687, 9074.32] - - [256, 256, 64, 112] - - [592, 8305.65] + - [687, 8305.65] - - [512, 512, 64, 18] - - [585, 4324.12] + - [680, 4324.12] - - [256, 256, 64, 18] - - [548, 3547.91] + - [643, 3547.91] - - [256, 256, 64, 1568] - - [557, 10141.8] + - [652, 10141.8] - - [64, 96, 36, 1568] - - [578, 6805.76] + - [673, 6805.76] - - [384, 256, 36, 4096] - - [592, 9311.2] + - [687, 9311.2] - - [256, 512, 64, 800] - - [567, 9998.45] + - [662, 9998.45] - - [256, 384, 36, 2048] - - [592, 9285.44] + - [687, 9285.44] - - [3, 64, 36, 200704] - - [574, 547.475] + - [669, 547.475] - - [384, 384, 64, 2304] - - [515, 9901.78] + - [610, 9901.78] - - [160, 320, 64, 128] - - [551, 7113.91] + - [646, 7113.91] - - [512, 512, 36, 528] - - [521, 9567.75] + - [616, 9567.75] - - [160, 320, 36, 128] - - [552, 6411.23] + - [647, 6411.23] - - [96, 96, 64, 800] - - [560, 7690.11] + - [655, 7690.11] - - [256, 512, 36, 49] - - [548, 6721.35] + - [643, 6721.35] - - [384, 384, 64, 450] - - [521, 9523.63] + - [616, 9523.63] - - [3, 64, 64, 23104] - - [573, 801.721] + - [668, 801.721] - - [256, 256, 64, 3200] - - [557, 10300.5] + - [652, 10300.5] - - [128, 192, 36, 512] - - [535, 7499.85] + - [630, 7499.85] - - [192, 192, 64, 288] - - [592, 8774.34] + - [687, 8774.34] - - [96, 208, 64, 242] - - [552, 5902.09] + - [647, 5902.09] - - [256, 16, 36, 3200] - - [581, 3807.87] + - [676, 3807.87] - - [512, 512, 64, 8] - - [559, 2379.85] + - [654, 2379.85] - - [64, 128, 64, 5776] - - [532, 9332.84] + - [627, 9332.84] - - [512, 512, 64, 288] - - [521, 9522.09] + - [616, 9522.09] - - [256, 16, 36, 32] - - [577, 766.105] + - [672, 766.105] - - [128, 192, 64, 288] - - [592, 8527.68] + - [687, 8527.68] - - [32, 64, 64, 640] - - [560, 4660.44] + - [655, 4660.44] - - [64, 64, 36, 392] - - [560, 3686.5] + - [655, 3686.5] - - [384, 384, 36, 1024] - - [526, 9282.58] + - [621, 9282.58] - - [64, 64, 36, 11552] - - [590, 5904.88] + - [685, 5904.88] - - [96, 128, 36, 6272] - - [580, 8351.09] + - [675, 8351.09] - - [128, 256, 36, 16] - - [562, 2144.91] + - [657, 2144.91] - - [256, 256, 64, 288] - - [592, 9140.23] + - [687, 9140.23] - - [64, 64, 64, 1652] - - [578, 7766.63] + - [673, 7766.63] - - [256, 384, 36, 1024] - - [526, 9203.37] + - [621, 9203.37] - - [96, 128, 64, 3200] - - [595, 8866.3] + - [690, 8866.3] - - [256, 324, 36, 3200] - - [534, 8194.35] + - [629, 8194.35] - - [128, 192, 64, 800] - - [592, 9198.13] + - [687, 9198.13] - - [64, 128, 64, 10] - - [533, 851.217] + - [628, 851.217] - - [96, 208, 64, 288] - - [560, 6667.68] + - [655, 6667.68] - - [64, 96, 36, 2592] - - [542, 7216.98] + - [637, 7216.98] - - [64, 128, 64, 160] - - [571, 5191.07] + - [666, 5191.07] - - [192, 384, 64, 512] - - [521, 9446.14] + - [616, 9446.14] - - [64, 64, 36, 6272] - - [530, 6212.11] + - [625, 6212.11] - - [512, 24, 36, 288] - - [527, 3922.57] + - [622, 3922.57] - - [128, 128, 64, 1568] - - [521, 9037.96] + - [616, 9037.96] - - [112, 224, 64, 242] - - [591, 6399.36] + - [686, 6399.36] - - [128, 256, 64, 1600] - - [557, 10010.4] + - [652, 10010.4] - - [32, 32, 64, 20000] - - [525, 4378.51] + - [620, 4378.51] - - [160, 192, 64, 288] - - [552, 7803.73] + - [647, 7803.73] - - [512, 24, 64, 128] - - [513, 3733.9] + - [608, 3733.9] - - [512, 512, 36, 32] - - [548, 5935.44] + - [643, 5935.44] - - [3, 64, 36, 100352] - - [545, 542.883] + - [640, 542.883] - - [3, 64, 64, 1444] - - [574, 674.259] + - [669, 674.259] - - [512, 512, 36, 3136] - - [515, 9921.2] + - [610, 9921.2] - - [128, 256, 64, 6400] - - [575, 10349.4] + - [670, 10349.4] - - [256, 256, 36, 2048] - - [592, 9519.09] + - [687, 9519.09] - - [128, 160, 64, 288] - - [535, 7549.85] + - [630, 7549.85] - - [256, 256, 64, 6400] - - [557, 10392.7] + - [652, 10392.7] - - [32, 64, 64, 20000] - - [583, 6493.96] + - [678, 6493.96] - - [256, 256, 36, 1680] - - [532, 9513.39] + - [627, 9513.39] - - [128, 128, 64, 210] - - [592, 7094.2] + - [687, 7094.2] - - [192, 384, 36, 2048] - - [521, 8818.75] + - [616, 8818.75] - - [256, 256, 64, 144] - - [592, 8608.71] + - [687, 8608.71] - - [384, 384, 36, 4096] - - [526, 9357.04] + - [621, 9357.04] - - [160, 320, 64, 1152] - - [552, 8749.58] + - [647, 8749.58] - - [384, 256, 36, 2048] - - [592, 9279.73] + - [687, 9279.73] - - [256, 512, 36, 392] - - [592, 9252.24] + - [687, 9252.24] - - [256, 512, 64, 50] - - [548, 7511.39] + - [643, 7511.39] - - [73, 192, 36, 23360] - - [596, 5803.03] + - [691, 5803.03] - - [3, 64, 36, 50176] - - [545, 542.137] + - [640, 542.137] - - [384, 384, 36, 2048] - - [521, 9325.9] + - [616, 9325.9] - - [256, 384, 64, 450] - - [567, 9528.76] + - [662, 9528.76] - - [192, 320, 64, 128] - - [526, 8399.91] + - [621, 8399.91] - - [128, 256, 36, 32] - - [541, 3276.9] + - [636, 3276.9] - - [160, 192, 36, 512] - - [580, 7752.44] + - [675, 7752.44] - - [512, 512, 64, 256] - - [532, 9473.74] + - [627, 9473.74] - - [256, 512, 64, 32] - - [570, 6391.42] + - [665, 6391.42] - - [384, 384, 64, 576] - - [521, 9614.89] + - [616, 9614.89] - - [64, 64, 64, 648] - - [578, 6282.25] + - [673, 6282.25] - - [512, 486, 36, 288] - - [592, 8625.03] + - [687, 8625.03] - - [32, 64, 36, 1440] - - [530, 3961.6] + - [625, 3961.6] - - [144, 288, 64, 242] - - [552, 6347.12] + - [647, 6347.12] - - [384, 256, 64, 576] - - [557, 9775.34] + - [652, 9775.34] - - [512, 512, 36, 64] - - [528, 7791.38] + - [623, 7791.38] - - [448, 384, 64, 128] - - [521, 9132.33] + - [616, 9132.33] - - [64, 128, 64, 722] - - [571, 8047.21] + - [666, 8047.21] - - [144, 288, 64, 288] - - [580, 6859.5] + - [675, 6859.5] - - [512, 512, 64, 224] - - [592, 9427.39] + - [687, 9427.39] - - [112, 224, 64, 288] - - [591, 6737.02] + - [686, 6737.02] - - [384, 384, 64, 1152] - - [515, 9820.56] + - [610, 9820.56] - - [448, 384, 36, 128] - - [592, 8761.41] + - [687, 8761.41] - - [64, 64, 64, 100] - - [538, 2708.2] + - [633, 2708.2] - - [256, 486, 36, 128] - - [564, 7640.14] + - [659, 7640.14] - - [64, 96, 64, 4608] - - [593, 8351.59] + - [688, 8351.59] - - [16, 32, 64, 160] - - [517, 736.46] + - [612, 736.46] - - [64, 192, 36, 6272] - - [593, 8041.29] + - [688, 8041.29] - - [64, 64, 64, 200] - - [546, 3924.41] + - [641, 3924.41] - - [256, 256, 36, 800] - - [592, 9299.65] + - [687, 9299.65] - - [64, 128, 36, 6272] - - [590, 6816.46] + - [685, 6816.46] - - [32, 64, 64, 40] - - [537, 885.722] + - [632, 885.722] - - [256, 16, 64, 32] - - [587, 1205.36] + - [682, 1205.36] - - [192, 384, 36, 800] - - [526, 8673.98] + - [621, 8673.98] - - [128, 128, 36, 3200] - - [560, 8538.99] + - [655, 8538.99] - - [256, 256, 36, 256] - - [532, 8454.46] + - [627, 8454.46] - - [192, 384, 64, 1152] - - [521, 9589.11] + - [616, 9589.11] - - [128, 256, 64, 200] - - [531, 8141.22] + - [626, 8141.22] - - [64, 96, 64, 1152] - - [560, 7620.98] + - [655, 7620.98] - - [128, 128, 36, 392] - - [535, 6175.61] + - [630, 6175.61] - - [80, 192, 36, 10368] - - [583, 6497.26] + - [678, 6497.26] - - [224, 224, 36, 128] - - [593, 5826.99] + - [688, 5826.99] - - [512, 512, 64, 28] - - [548, 5728.91] + - [643, 5728.91] - - [256, 16, 64, 1568] - - [563, 4637.3] + - [658, 4637.3] - - [144, 288, 64, 1152] - - [580, 7784.34] + - [675, 7784.34] - - [256, 256, 64, 576] - - [557, 9596.22] + - [652, 9596.22] - - [64, 128, 36, 784] - - [593, 6059.09] + - [688, 6059.09] - - [256, 24, 36, 128] - - [527, 2239.94] + - [622, 2239.94] - - [256, 256, 64, 2304] - - [557, 10225.8] + - [652, 10225.8] - - [192, 384, 36, 512] - - [592, 8549.13] + - [687, 8549.13] - - [16, 32, 64, 2560] - - [545, 2153.23] + - [640, 2153.23] - - [256, 512, 36, 32] - - [570, 5702.33] + - [665, 5702.33] - - [512, 512, 64, 128] - - [592, 9084.21] + - [687, 9084.21] - - [128, 128, 64, 200] - - [529, 6972.01] + - [624, 6972.01] - - [512, 512, 64, 32] - - [541, 6248.6] + - [636, 6248.6] - - [128, 256, 36, 196] - - [541, 6628.86] + - [636, 6628.86] - - [8, 384, 64, 6600] - - [573, 2733.99] + - [668, 2733.99] - - [149, 32, 64, 8195] - - [535, 6051.01] + - [630, 6051.01] - - [35, 96, 64, 6160] - - [580, 4689.45] + - [675, 4689.45] - - [64, 64, 36, 1760] - - [530, 5622.34] + - [625, 5622.34] + - - [196, 528, 32, 32] + - [708, 4088.51] + - - [5329, 64, 32, 80] + - [701, 8331.24] + - - [64, 2880, 1, 320] + - [752, 4362.7] + - - [49, 832, 32, 256] + - [715, 5618.73] + - - [3136, 64, 64, 64] + - [701, 8457.75] + - - [196, 512, 32, 24] + - [702, 3621.83] + - - [289, 1120, 1, 160] + - [698, 3302.96] + - - [1225, 192, 32, 32] + - [706, 6194.67] + - - [64, 2048, 32, 384] + - [729, 9541.64] + - - [1001, 1536, 1, 32] + - [700, 3575.77] + - - [289, 1792, 1, 320] + - [723, 5140.43] + - - [3136, 256, 64, 64] + - [724, 9310.22] + - - [1001, 1024, 1, 32] + - [695, 2733.5] + - - [196, 480, 32, 64] + - [756, 5070.52] + - - [64, 1728, 1, 320] + - [753, 3205.67] + - - [49, 832, 32, 160] + - [757, 4988.92] + - - [49, 2048, 64, 512] + - [727, 7370.41] + - - [49, 832, 32, 384] + - [715, 5902.05] + - - [289, 896, 1, 192] + - [741, 3452.69] + - - [289, 1024, 32, 384] + - [760, 8902.52] + - - [784, 192, 32, 96] + - [771, 7853.73] + - - [50176, 256, 1, 128] + - [734, 9041.93] + - - [289, 1024, 32, 256] + - [769, 8660.82] + - - [289, 1024, 32, 192] + - [758, 8433.45] + - - [12544, 512, 1, 256] + - [718, 9187.44] + - - [1225, 1728, 1, 192] + - [722, 7720.95] + - - [196, 480, 32, 96] + - [767, 5662.6] + - - [196, 512, 32, 144] + - [761, 6531.48] + - - [784, 400, 1, 32] + - [696, 1280.1] + - - [289, 768, 32, 128] + - [762, 7913.71] + - - [5329, 576, 1, 96] + - [705, 7563.56] + - - [49, 1200, 1, 128] + - [749, 1011.71] + - - [64, 1536, 32, 256] + - [763, 9159.64] + - - [289, 2592, 1, 384] + - [731, 6002.81] + - - [196, 528, 32, 128] + - [766, 5987.2] + - - [64, 2048, 32, 448] + - [729, 9669.97] + - - [196, 1024, 64, 256] + - [768, 7819.04] + - - [5329, 448, 1, 64] + - [701, 6201.12] + - - [784, 256, 32, 64] + - [703, 7623.28] + - - [784, 192, 32, 32] + - [708, 5874.36] + - - [21609, 288, 1, 32] + - [721, 5296.6] + - - [784, 256, 32, 32] + - [699, 6235.56] + - - [5041, 720, 1, 192] + - [717, 8141.08] + - - [289, 2016, 1, 256] + - [714, 5404.15] + - - [196, 512, 32, 128] + - [759, 6366.92] + - - [289, 768, 32, 160] + - [761, 8253.98] + - - [64, 1536, 32, 384] + - [732, 9508.6] + - - [64, 1280, 32, 320] + - [732, 9070.83] + - - [289, 896, 1, 128] + - [742, 2917.78] + - - [289, 3456, 1, 384] + - [722, 7275.01] + - - [196, 800, 1, 64] + - [744, 1393.88] + - - [64, 1280, 32, 384] + - [728, 9225.11] + - - [64, 1344, 1, 512] + - [747, 3041.55] + - - [1001, 4096, 1, 512] + - [728, 9391.87] + - - [1225, 192, 32, 64] + - [701, 7729.39] + - - [64, 1152, 1, 384] + - [751, 2440.75] + - - [729, 1600, 1, 192] + - [713, 6827.81] + - - [289, 1344, 1, 192] + - [711, 4439.14] + - - [784, 192, 32, 16] + - [738, 3663.14] + - - [3136, 1024, 1, 2048] + - [720, 9071.87] + - - [64, 1152, 1, 448] + - [748, 2564.55] + - - [49, 832, 32, 128] + - [711, 4733.26] + - - [784, 256, 32, 128] + - [724, 8471.7] + - - [49, 800, 1, 128] + - [746, 633.635] + - - [196, 512, 32, 32] + - [708, 4354.36] + - - [1225, 384, 32, 96] + - [725, 8751.73] + - - [5041, 576, 1, 96] + - [707, 7067.73] + - - [49, 832, 32, 48] + - [740, 3316.82] + - - [3136, 64, 64, 256] + - [762, 9722.0] + - - [5329, 160, 32, 64] + - [764, 8159.94] + - - [1225, 288, 32, 48] + - [754, 6673.75] + - - [4096, 9216, 1, 512] + - [736, 10117.0] + - - [196, 480, 32, 192] + - [765, 6388.56] + - - [64, 1152, 1, 256] + - [752, 1982.7] + - - [3136, 1024, 1, 512] + - [720, 8745.67] + - - [49, 832, 32, 32] + - [739, 2717.97] + - - [784, 192, 32, 64] + - [703, 7216.42] + - - [289, 1024, 32, 128] + - [726, 7970.6] + - - [289, 768, 32, 192] + - [770, 8327.37] + - - [289, 1120, 1, 192] + - [710, 3717.0] + - - [196, 512, 32, 112] + - [716, 6252.91] + - - [1001, 2048, 1, 32] + - [704, 4000.19] + - - [1225, 288, 32, 64] + - [764, 7208.14] + - - [196, 600, 1, 64] + - [743, 1094.05] + - - [1225, 384, 32, 192] + - [725, 9332.76] + - - [50176, 256, 1, 512] + - [735, 9833.64] + - - [196, 512, 32, 160] + - [762, 6614.44] + - - [4096, 4096, 1, 512] + - [733, 10032.3] + - - [49, 832, 32, 192] + - [711, 5244.63] + - - [1225, 256, 32, 64] + - [701, 7972.45] + - - [64, 2048, 32, 320] + - [729, 9404.37] + - - [196, 480, 32, 16] + - [755, 2724.59] + - - [1225, 256, 32, 48] + - [703, 7100.48] + - - [64, 1280, 32, 448] + - [728, 9344.51] + - - [1225, 1200, 1, 64] + - [697, 5157.99] + - - [1225, 384, 32, 64] + - [701, 8220.06] + - - [12544, 512, 1, 1024] + - [720, 9672.82] + - - [64, 1280, 32, 192] + - [716, 8525.11] + - - [196, 512, 32, 64] + - [701, 5489.44] + - - [289, 1792, 1, 256] + - [719, 4831.71] + - - [196, 528, 32, 256] + - [737, 6453.92] + - - [49, 512, 64, 2048] + - [772, 7549.08] + - - [64, 2048, 32, 192] + - [724, 8955.91] + - - [784, 512, 64, 128] + - [724, 9160.83] + - - [784, 128, 64, 512] + - [731, 9280.79] + - - [196, 528, 32, 160] + - [765, 6161.25] + - - [1225, 192, 32, 48] + - [701, 7237.02] + - - [64, 1728, 1, 192] + - [751, 2480.67] + - - [1001, 2048, 1, 64] + - [777, 5714.52] + - - [5329, 64, 128, 80] + - [784, 8835.39] + - - [64, 1280, 128, 448] + - [782, 10020.6] + - - [289, 768, 128, 128] + - [785, 8542.81] + - - [1225, 192, 128, 64] + - [774, 8444.87] + - - [1225, 288, 128, 48] + - [787, 7244.76] + - - [289, 768, 128, 192] + - [789, 8794.59] + - - [289, 768, 128, 160] + - [786, 8705.43] + - - [64, 2048, 128, 192] + - [780, 9780.36] + - - [64, 1280, 128, 384] + - [783, 9951.0] + - - [1225, 256, 128, 48] + - [775, 8273.71] + - - [1225, 192, 128, 48] + - [775, 8140.42] + - - [1225, 288, 128, 64] + - [787, 7886.31] + - - [64, 1280, 128, 320] + - [779, 9894.66] + - - [1225, 256, 128, 64] + - [780, 8572.61] + - - [1001, 2048, 1, 128] + - [781, 7289.16] + - - [1225, 192, 128, 32] + - [776, 7104.67] + - - [64, 1280, 128, 192] + - [788, 9642.18] + - - [1001, 1536, 1, 64] + - [778, 5146.66] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml index 096950937..3a6e9917c 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -98605,15 +98605,15 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -98621,7 +98621,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -98629,37 +98629,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -98667,30 +98668,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98698,6 +98708,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98707,6 +98718,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -98716,53 +98728,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 621 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -98770,7 +98793,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -98778,37 +98801,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -98816,30 +98840,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98847,6 +98880,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98856,6 +98890,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -98865,47 +98900,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 622 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -98917,9 +98963,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -98927,75 +98973,82 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 4 LSPB: 16 - LVCA: 32 + LVCA: 64 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99005,6 +99058,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99014,95 +99068,111 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 623 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id004 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2560 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -99111,36 +99181,46 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99150,6 +99230,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99159,95 +99240,111 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 624 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -99255,37 +99352,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99295,6 +99402,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99304,53 +99412,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 625 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id003 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99359,7 +99478,7 @@ ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99367,63 +99486,73 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -99431,6 +99560,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99440,6 +99570,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99449,28 +99580,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 626 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -99481,30 +99622,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id007 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99512,70 +99654,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 32 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99585,6 +99742,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99594,28 +99752,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 627 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -99626,98 +99794,107 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99725,6 +99902,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99734,6 +99912,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99743,28 +99922,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 628 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -99774,64 +99963,72 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -99840,36 +100037,44 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99879,6 +100084,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -99888,62 +100094,75 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 629 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99951,70 +100170,83 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100024,6 +100256,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100033,28 +100266,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 630 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -100065,21 +100308,24 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id007 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100087,7 +100333,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -100095,75 +100341,86 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100173,6 +100430,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100182,133 +100440,159 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 631 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 2560 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100318,6 +100602,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100327,53 +100612,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 632 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100381,7 +100677,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -100389,33 +100685,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 4352 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100424,29 +100721,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 4 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 4 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -100454,6 +100760,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100463,6 +100770,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100472,95 +100780,111 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 633 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id007 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 8 LSPA: 8 - LSPB: 4 + LSPB: 128 LVCA: 32 - LVCB: 64 + LVCB: 2 LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 4352 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100569,36 +100893,46 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 4 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 4 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100608,6 +100942,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100617,53 +100952,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 634 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100671,45 +101017,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 48 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 4 - LSPB: 12 - LVCA: 48 - LVCB: 16 - LVPA: 4 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100717,30 +101064,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100748,6 +101104,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100757,6 +101114,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100766,47 +101124,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 635 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 12, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -100818,41 +101187,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 12 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 12 - LVCA: 12 - LVCB: 16 - LVPA: 16 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -100866,37 +101232,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100906,6 +101282,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100915,53 +101292,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 636 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 - SubGroup0: 12 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 12 + SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100969,39 +101357,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -101014,38 +101403,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101055,6 +101454,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101064,53 +101464,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 637 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101118,7 +101529,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -101126,37 +101537,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 48 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 12 - LVCA: 24 - LVCB: 16 - LVPA: 4 - LVPB: 6 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101164,30 +101576,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101195,6 +101616,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101204,6 +101626,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101213,53 +101636,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 638 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 - SubGroup0: 8 - SubGroup1: 6 - SubGroupA: 8 - SubGroupB: 6 - SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id010 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101267,45 +101701,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 24 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 6 - LVCA: 24 - LVCB: 32 - LVPA: 8 - LVPB: 6 - LdcEqualsLdd: false + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101313,30 +101748,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 24 - MacroTile1: 24 - MacroTileA: 24 - MacroTileB: 24 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 3 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101344,6 +101788,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101353,6 +101798,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101362,53 +101808,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 639 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 - SubGroup0: 8 - SubGroup1: 6 - SubGroupA: 8 - SubGroupB: 6 - SuppresssNoLoadLoop: false - ThreadTile: [3, 4] - ThreadTile0: 3 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 3 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id010 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101416,7 +101873,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -101424,37 +101881,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 48 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 12 - LVCA: 24 - LVCB: 16 - LVPA: 4 - LVPB: 6 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101462,37 +101920,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101502,6 +101970,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101511,49 +101980,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 640 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 - SubGroup0: 8 - SubGroup1: 6 - SubGroupA: 8 - SubGroupB: 6 - SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id010 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -101563,43 +102043,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101611,30 +102092,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101642,6 +102132,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101651,6 +102142,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101660,95 +102152,107 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 641 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101759,31 +102263,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101791,6 +102304,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101800,6 +102314,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101809,49 +102324,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 642 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -101861,41 +102387,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 8 - LVCB: 2 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -101909,30 +102436,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101940,6 +102474,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101949,6 +102484,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101958,49 +102494,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 643 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102010,41 +102559,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 LSPA: 4 - LSPB: 32 - LVCA: 16 - LVCB: 2 - LVPA: 1 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102058,30 +102608,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102089,6 +102646,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102098,6 +102656,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102107,49 +102666,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 644 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102159,41 +102731,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102207,30 +102780,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102238,6 +102818,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102247,6 +102828,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102256,49 +102838,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 645 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102308,41 +102903,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 16 + LSPA: 8 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102356,30 +102952,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102387,6 +102990,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102396,6 +103000,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102405,49 +103010,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 646 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102457,9 +103075,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -102467,33 +103085,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102505,30 +103124,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102536,6 +103162,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102545,6 +103172,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102554,49 +103182,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 647 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102606,41 +103247,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102654,30 +103296,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102685,6 +103336,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102694,6 +103346,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102703,49 +103356,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 648 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -102755,47 +103419,22670 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 LVCB: 2 LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 800 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1680 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1296 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1312 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1312 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 32 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 699 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 708 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 16 + LSPA: 4 + LSPB: 12 + LVCA: 48 + LVCB: 16 + LVPA: 4 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 12, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 12 + LSCB: 16 + LSPA: 16 + LSPB: 12 + LVCA: 12 + LVCB: 16 + LVPA: 16 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 24 + LSCB: 32 + LSPA: 8 + LSPB: 6 + LVCA: 24 + LVCB: 32 + LVPA: 8 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 24 + MacroTile1: 24 + MacroTileA: 24 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 3 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: [3, 4] + ThreadTile0: 3 + ThreadTile1: 4 + ThreadTileA: 3 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 790 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 791 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 792 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 793 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -102804,9 +126091,299 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 794 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 795 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102822,12 +126399,12 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -102866,14 +126443,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 649 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 796 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id016 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -102884,8 +126461,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102898,7 +126475,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102906,39 +126483,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102951,11 +126528,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102963,15 +126540,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103015,26 +126592,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 650 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 797 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -103047,7 +126624,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -103055,39 +126632,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103100,11 +126677,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103112,15 +126689,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103164,26 +126741,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 651 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 798 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id031 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -103205,7 +126782,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103218,25 +126795,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103250,10 +126823,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103261,20 +126834,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103313,25 +126886,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 652 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 799 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 4, 1] + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -103353,39 +126926,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 16 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103399,10 +126968,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103410,20 +126979,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103462,25 +127031,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 653 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 800 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -103502,39 +127071,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103548,10 +127117,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103559,15 +127128,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103611,25 +127180,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 654 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 801 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -103668,22 +127237,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103697,10 +127262,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103710,18 +127275,18 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103760,14 +127325,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 655 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 802 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id016 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -103778,8 +127343,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -103800,35 +127365,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -103847,9 +127412,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103857,15 +127422,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103909,26 +127474,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 656 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 803 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id029 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -103968,20 +127533,20 @@ KernelLanguage: Source LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103996,9 +127561,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104006,15 +127571,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -104058,26 +127623,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 657 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 + SolutionIndex: 804 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id016 + ThreadTile: *id031 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -104115,22 +127680,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104144,10 +127705,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104155,20 +127716,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104207,25 +127768,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 658 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 805 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: *id028 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -104239,7 +127800,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104247,39 +127808,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 16 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 384 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104292,11 +127853,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104304,15 +127865,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 6 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 6 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -104356,26 +127917,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 659 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 806 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -104396,39 +127957,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104442,10 +127999,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104453,20 +128010,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104505,96 +128062,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 660 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SolutionIndex: 807 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104602,20 +128155,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104654,48 +128207,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 661 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 808 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id031 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104703,47 +128256,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 128 - LSCB: 8 + LSCB: 32 LSPA: 8 LSPB: 32 LVCA: 32 LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 32 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104751,20 +128300,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104803,96 +128352,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 662 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 809 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 + ThreadTile: *id030 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104900,20 +128445,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104952,96 +128497,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 663 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 810 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105049,15 +128594,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105101,26 +128646,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 664 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 811 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -105133,7 +128678,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105142,7 +128687,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105150,36 +128695,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 128 LVCA: 32 - LVCB: 16 + LVCB: 2 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -105188,9 +128733,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105198,13 +128743,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -105250,25 +128795,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 665 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 812 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -105282,7 +128827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105291,7 +128836,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105299,26 +128844,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -105328,18 +128873,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105347,8 +128892,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -105399,26 +128944,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 666 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 813 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -105437,58 +128982,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105496,20 +129037,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105548,25 +129089,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 667 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 814 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -105588,56 +129129,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105645,15 +129186,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105697,25 +129238,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 668 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SolutionIndex: 815 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -105735,7 +129276,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -105746,7 +129287,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -105754,39 +129295,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPB: 64 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105794,20 +129331,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105846,25 +129383,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 669 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 816 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -105895,7 +129432,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -105905,37 +129442,37 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105943,15 +129480,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105995,25 +129532,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 670 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionIndex: 817 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -106035,8 +129572,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106044,47 +129581,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106092,14 +129629,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -106144,25 +129681,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 671 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 818 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -106182,58 +129719,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106241,20 +129774,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106293,25 +129826,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 672 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 819 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -106333,56 +129866,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106390,8 +129923,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -106442,25 +129975,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 673 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 820 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -106482,35 +130015,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 7680 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -106520,18 +130053,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106539,14 +130072,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -106591,26 +130124,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 674 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 821 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id025 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -106629,10 +130162,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106640,47 +130173,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106688,20 +130217,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106740,26 +130269,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 675 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 822 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -106781,7 +130310,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106789,26 +130318,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -106818,18 +130347,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106837,14 +130366,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -106889,26 +130418,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 676 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 823 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -106927,58 +130456,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106986,20 +130511,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107038,25 +130563,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 677 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 824 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id035 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107087,7 +130612,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -107095,39 +130620,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPB: 64 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107135,15 +130660,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107187,14 +130712,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 678 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SolutionIndex: 825 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -107205,7 +130730,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107225,7 +130750,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -107236,7 +130761,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -107246,37 +130771,33 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107284,20 +130805,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107336,25 +130857,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 679 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 826 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107376,56 +130897,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107433,14 +130954,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -107485,25 +131006,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 680 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 827 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107526,7 +131047,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -107534,47 +131055,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107582,14 +131103,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -107634,25 +131155,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 681 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 828 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107672,58 +131193,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107731,20 +131248,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107783,25 +131300,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 682 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 829 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107823,56 +131340,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107880,8 +131397,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -107932,25 +131449,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 683 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 830 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -107972,35 +131489,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 7680 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -108010,18 +131527,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108029,14 +131546,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -108081,25 +131598,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 684 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 831 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id024 + VectorWidth: 2 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -108119,10 +131636,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108130,47 +131647,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108178,20 +131691,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108230,26 +131743,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 685 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 832 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -108270,56 +131783,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108327,15 +131840,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108379,26 +131892,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 686 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 833 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -108411,13 +131924,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -108428,7 +131941,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -108436,39 +131949,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108476,20 +131985,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108528,25 +132037,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 687 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 834 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -108560,7 +132069,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108577,7 +132086,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -108585,39 +132094,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108625,15 +132134,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108677,25 +132186,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 688 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + SolutionIndex: 835 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -108709,64 +132218,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108774,20 +132279,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108826,26 +132331,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 689 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 836 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -108858,7 +132363,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108866,56 +132371,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108923,13 +132428,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -108975,26 +132480,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 690 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 837 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109007,16 +132512,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109024,47 +132529,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109072,20 +132573,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109124,14 +132625,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 691 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 838 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -109142,8 +132643,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109156,7 +132657,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109173,7 +132674,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -109181,39 +132682,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109221,15 +132722,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109273,14 +132774,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 692 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SolutionIndex: 839 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -109291,8 +132792,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109305,64 +132806,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109370,20 +132867,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109422,26 +132919,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 693 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 840 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109454,7 +132951,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109462,56 +132959,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109519,14 +133016,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -109571,25 +133068,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 694 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 841 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -109603,13 +133100,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -109620,7 +133117,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -109628,39 +133125,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 32 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109668,20 +133161,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109720,14 +133213,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 695 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 842 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id027 + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -109738,7 +133231,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -109752,7 +133245,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109769,7 +133262,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -109777,39 +133270,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 32 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109817,15 +133310,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109869,14 +133362,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 696 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SolutionIndex: 843 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -109887,8 +133380,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -109907,58 +133400,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109966,20 +133455,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110018,26 +133507,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 697 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 844 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110058,35 +133547,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -110096,18 +133585,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110115,14 +133604,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -110167,26 +133656,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 698 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 845 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110205,7 +133694,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -110216,7 +133705,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -110233,30 +133722,26 @@ LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110264,20 +133749,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110316,26 +133801,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 699 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 846 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id027 + ThreadTile: *id037 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110356,56 +133841,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110413,15 +133898,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110465,26 +133950,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 700 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 847 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110503,58 +133988,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110562,20 +134043,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110614,26 +134095,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 701 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 848 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -110663,7 +134144,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -110671,39 +134152,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110711,14 +134192,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -110763,96 +134244,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 702 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 849 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + KernelLanguage: Source + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110860,15 +134341,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110912,96 +134393,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 703 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 850 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 + VectorWidth: 2 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 2 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 32 + LSCB: 2 + LSPA: 2 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111009,15 +134490,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111061,25 +134542,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 704 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 851 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -111093,7 +134574,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111101,7 +134582,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -111110,30 +134591,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111146,11 +134627,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111160,11 +134641,11 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -111210,26 +134691,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 705 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionIndex: 852 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id044 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111242,7 +134723,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111250,56 +134731,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111307,15 +134788,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111359,26 +134840,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 706 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 853 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id027 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id042 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111391,7 +134872,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111399,8 +134880,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -111408,30 +134889,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111444,11 +134925,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111456,15 +134937,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111508,26 +134989,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 707 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 854 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 + ThreadTile: *id041 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111540,7 +135021,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111548,56 +135029,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111606,14 +135087,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111657,26 +135138,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 708 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 855 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id042 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111689,7 +135170,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111697,39 +135178,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111742,11 +135223,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111754,15 +135235,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111806,26 +135287,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 709 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 856 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111838,7 +135319,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111846,39 +135327,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111891,11 +135372,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111903,15 +135384,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111955,26 +135436,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 710 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 857 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -111987,7 +135468,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111995,56 +135476,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 8 + LSCB: 8 LSPA: 8 - LSPB: 32 - LVCA: 32 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112052,15 +135533,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112104,26 +135585,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 711 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SolutionIndex: 858 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id027 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -112136,7 +135617,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112144,7 +135625,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -112153,30 +135634,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -112189,11 +135670,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112203,11 +135684,11 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -112253,25 +135734,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 712 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 859 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: *id044 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -112285,7 +135766,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112293,7 +135774,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -112301,37 +135782,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 32 LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3360 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112339,10 +135820,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112350,13 +135831,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -112402,25 +135883,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 713 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SolutionIndex: 860 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id045 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 + VectorWidth: 4 + WorkGroup: *id046 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -112434,7 +135915,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112443,7 +135924,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -112451,36 +135932,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3360 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112488,9 +135969,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -112499,8 +135980,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -112551,14 +136032,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 714 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 + SolutionIndex: 861 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SuppresssNoLoadLoop: true + ThreadTile: *id045 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -112569,8 +136050,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 1 + WorkGroup: *id046 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -112583,7 +136064,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112591,56 +136072,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112648,13 +136129,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -112700,26 +136181,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 715 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 862 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id045 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -112732,7 +136213,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112749,7 +136230,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -112757,39 +136238,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112797,13 +136278,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -112849,26 +136330,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 716 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 + SolutionIndex: 863 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SuppresssNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + WorkGroup: *id046 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -112881,64 +136362,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112946,21 +136423,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -112998,25 +136475,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 717 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 864 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 32 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + ThreadTile: *id047 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 + VectorWidth: 1 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -113030,53 +136507,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCA: 128 + LSCB: 16 + LSPA: 2 LSPB: 16 - LVCA: 32 + LVCA: 128 LVCB: 16 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -113084,10 +136557,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113095,21 +136568,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113147,25 +136620,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 718 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 865 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id047 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id024 + WorkGroup: *id048 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -113179,15 +136652,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 16 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -113195,48 +136668,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 256 + LSCB: 16 + LSPA: 1 + LSPB: 16 + LVCA: 256 + LVCB: 16 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 4096 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113244,21 +136713,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113296,26 +136765,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 719 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 866 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id050 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: *id048 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -113328,15 +136797,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -113344,48 +136813,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 16 + LSPA: 2 LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113393,21 +136858,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113445,77 +136910,77 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 720 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 867 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id047 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 2112 LdsOffsetA: 0 LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113526,11 +136991,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113538,21 +137003,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113590,35 +137055,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 721 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 868 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: *id047 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id049 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -113629,7 +137094,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -113646,7 +137111,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 16 LSPA: 16 @@ -113656,11 +137121,11 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 2112 LdsOffsetA: 0 LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113697,7 +137162,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113735,35 +137200,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 722 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 869 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id029 + ThreadTile: *id047 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 1 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -113774,7 +137239,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -113791,25 +137256,21 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113821,9 +137282,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -113832,21 +137293,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113884,79 +137345,77 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 723 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 870 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + ThreadTile: *id050 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 1 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -113969,11 +137428,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113982,12 +137441,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 @@ -114033,75 +137490,74 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 724 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 871 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 1 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 1 LSPB: 32 - LVCA: 16 + LVCA: 256 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 1 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114110,15 +137566,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114126,14 +137582,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -114178,75 +137632,78 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 725 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SolutionIndex: 872 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id053 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114259,7 +137716,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -114272,9 +137729,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -114284,7 +137739,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114323,79 +137778,78 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 726 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 873 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 + ThreadTile: *id051 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114408,7 +137862,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -114421,9 +137875,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -114472,46 +137924,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 727 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionIndex: 874 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -114528,19 +137979,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 2 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114553,7 +138008,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -114565,20 +138020,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114617,46 +138070,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 728 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 875 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + WorkGroup: *id055 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -114673,23 +138125,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 2 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114702,7 +138154,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -114714,15 +138166,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114766,35 +138216,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 729 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 876 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id055 WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -114804,8 +138254,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -114822,7 +138271,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 16 LSPA: 16 @@ -114832,13 +138281,9 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114865,8 +138310,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -114876,7 +138319,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114915,35 +138358,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 730 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SolutionIndex: 877 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -114953,37 +138396,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114997,10 +138443,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115008,20 +138454,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115060,35 +138504,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 731 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SolutionIndex: 878 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id052 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -115098,8 +138542,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115116,23 +138559,19 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115146,10 +138585,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115159,18 +138598,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115209,46 +138646,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 732 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 879 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + WorkGroup: *id052 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115265,17 +138701,17 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3072 LdsOffsetA: 0 LdsOffsetB: 2048 LdsPadA: 0 @@ -115290,10 +138726,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115302,14 +138738,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -115354,46 +138788,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 733 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 880 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + ThreadTile: *id051 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115410,19 +138843,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115435,11 +138872,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115449,18 +138886,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115499,46 +138934,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 734 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 881 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115555,19 +138989,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115580,7 +139018,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -115594,18 +139032,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115644,46 +139080,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 735 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionIndex: 882 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115700,19 +139135,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115725,10 +139164,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115737,20 +139176,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115789,48 +139226,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 736 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 + SolutionIndex: 883 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id053 WorkGroupMapping: 8 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -115843,25 +139281,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115874,10 +139312,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115886,15 +139324,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115938,46 +139379,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 737 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 884 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115992,25 +139443,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116023,7 +139474,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -116037,13 +139488,16 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116087,14 +139541,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 738 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 885 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -116105,15 +139566,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -116126,9 +139590,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -116141,21 +139605,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -116174,9 +139638,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116184,15 +139648,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116236,33 +139703,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 739 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 886 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -116274,8 +139751,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -116290,21 +139767,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116318,10 +139799,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116329,20 +139810,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116381,33 +139865,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 740 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 887 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -116420,7 +139914,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -116435,25 +139929,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116467,10 +139961,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116478,15 +139972,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116530,81 +140027,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 741 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 888 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -116613,9 +140124,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116623,20 +140134,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116675,48 +140189,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 742 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 889 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -116729,27 +140253,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116760,11 +140284,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116774,13 +140298,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116824,33 +140351,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 743 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 890 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -116863,57 +140400,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116921,15 +140458,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116973,33 +140513,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 744 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 891 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117011,8 +140561,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117027,7 +140577,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -117039,11 +140589,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117068,18 +140622,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117118,14 +140675,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 745 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 892 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -117136,15 +140700,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117157,9 +140724,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117172,42 +140739,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117215,15 +140782,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -117267,17 +140837,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 746 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 893 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -117285,15 +140862,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117306,57 +140886,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117364,15 +140944,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -117416,46 +140999,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 747 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 894 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117470,27 +141063,31 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 2 - LVPB: 16 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117498,10 +141095,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117509,20 +141106,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117561,46 +141161,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 748 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 895 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117615,31 +141225,31 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117647,10 +141257,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117658,15 +141268,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -117710,33 +141323,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 749 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 896 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117748,8 +141371,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117764,23 +141387,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117792,9 +141419,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -117803,20 +141430,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117855,17 +141485,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 750 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 897 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -117873,15 +141510,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -117894,9 +141534,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117909,42 +141549,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117952,15 +141592,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118004,14 +141647,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 751 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 898 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -118022,15 +141672,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118041,9 +141694,9 @@ DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -118058,7 +141711,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 @@ -118070,26 +141723,30 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118097,20 +141754,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118149,14 +141809,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 752 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 899 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -118167,15 +141834,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118188,9 +141858,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118203,42 +141873,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118246,15 +141916,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118298,85 +141971,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 753 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 900 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118384,10 +142067,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118395,15 +142078,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118447,33 +142133,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 754 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 901 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118485,8 +142181,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -118501,7 +142197,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -118513,11 +142209,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -118542,18 +142242,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118592,14 +142295,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 755 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 902 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -118610,15 +142320,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118631,9 +142344,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118646,42 +142359,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118689,15 +142402,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118741,17 +142457,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 756 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 903 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -118759,15 +142482,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118780,57 +142506,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118838,15 +142564,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -118890,33 +142619,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 757 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 904 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -118928,10 +142667,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118944,38 +142683,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118983,20 +142726,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119035,46 +142781,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 758 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 905 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119089,31 +142845,31 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -119121,10 +142877,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119132,15 +142888,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -119184,46 +142943,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 759 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 906 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119238,23 +143007,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -119265,11 +143038,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119277,20 +143050,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119329,48 +143105,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 760 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 907 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119383,42 +143169,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119426,15 +143212,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -119478,46 +143267,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 761 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 908 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119532,38 +143331,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119571,20 +143374,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119623,48 +143429,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 762 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 909 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119677,25 +143493,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119708,11 +143524,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119720,15 +143536,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -119749,6 +143568,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119758,6 +143578,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119772,48 +143593,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 763 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 910 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119826,21 +143657,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119853,10 +143688,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -119865,20 +143700,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119894,6 +143732,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119903,6 +143742,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119917,48 +143757,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 764 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 911 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119971,21 +143821,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 256 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 6400 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -120002,11 +143852,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120014,15 +143864,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -120043,6 +143896,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120052,6 +143906,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120066,48 +143921,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 765 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 912 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120120,21 +143985,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 LSPB: 32 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120147,11 +144016,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120159,20 +144028,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120188,6 +144060,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120197,6 +144070,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120211,17 +144085,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 766 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 913 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -120229,30 +144110,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120265,25 +144149,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120296,7 +144180,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -120310,13 +144194,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -120337,6 +144224,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120346,6 +144234,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120360,48 +144249,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 767 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 914 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120414,21 +144313,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120441,11 +144344,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120453,20 +144356,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120482,6 +144388,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120491,6 +144398,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120505,17 +144413,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 768 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 915 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -120523,30 +144438,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 15 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120559,25 +144477,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120590,11 +144508,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120602,15 +144520,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -120631,6 +144552,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120640,6 +144562,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120654,48 +144577,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 769 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 916 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120708,21 +144641,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120735,10 +144672,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -120747,20 +144684,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120776,6 +144716,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120785,6 +144726,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120799,17 +144741,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 770 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 917 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -120817,30 +144766,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120853,25 +144805,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120884,11 +144832,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120898,18 +144846,21 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120925,6 +144876,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120934,6 +144886,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120948,14 +144901,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 771 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 918 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -120966,30 +144926,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121002,21 +144965,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121029,7 +144996,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -121043,18 +145010,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -121070,6 +145040,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121079,6 +145050,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121093,48 +145065,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 772 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 919 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121147,25 +145129,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121178,11 +145156,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121190,20 +145168,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -121219,6 +145200,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121228,6 +145210,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121242,17 +145225,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 773 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 920 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -121260,28 +145250,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -121296,21 +145289,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121323,11 +145320,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121335,20 +145332,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -121364,6 +145364,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121373,6 +145374,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121387,46 +145389,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 774 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 921 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -121441,42 +145453,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121484,15 +145496,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -121513,6 +145528,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121522,6 +145538,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121536,17 +145553,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 775 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 922 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -121554,61 +145578,64 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121621,11 +145648,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121633,15 +145660,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -121662,6 +145692,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121671,6 +145702,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121685,48 +145717,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 776 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 923 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id040 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121738,43 +145780,43 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 2 - LSPA: 2 - LSPB: 32 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121784,13 +145826,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -121811,6 +145856,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121820,6 +145866,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121834,33 +145881,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 777 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 924 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id040 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -121873,57 +145930,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121931,15 +145988,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -121960,6 +146020,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -121969,6 +146030,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -121983,79 +146045,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 778 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 925 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 11 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -122068,11 +146140,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122080,15 +146152,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122109,6 +146184,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122118,6 +146194,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122132,96 +146209,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 779 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 926 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122229,15 +146316,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122258,6 +146348,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122267,6 +146358,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122279,98 +146371,108 @@ TotalIndices: 4 TransposeA: false TransposeB: false - UseBeta: true - UseInitialStrides: false - SolutionIndex: 780 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 927 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122378,15 +146480,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122407,6 +146512,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122416,6 +146522,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122430,96 +146537,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 781 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 928 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122527,15 +146644,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122556,6 +146676,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122565,6 +146686,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122579,33 +146701,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 782 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 929 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -122618,57 +146750,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122676,15 +146808,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122705,6 +146840,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122714,6 +146850,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122728,96 +146865,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 783 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 930 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122825,15 +146972,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -122854,6 +147004,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -122863,6 +147014,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -122877,33 +147029,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 784 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 931 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -122916,57 +147078,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122974,15 +147136,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123003,6 +147168,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123012,6 +147178,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123026,48 +147193,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 785 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 932 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123084,23 +147261,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 128 LVCA: 32 - LVCB: 8 - LVPA: 2 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3360 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123111,11 +147288,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123123,15 +147300,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123152,6 +147332,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123161,6 +147342,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123175,17 +147357,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 786 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 933 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 32 + SubGroupA: 16 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -123193,30 +147382,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 + WorkGroup: [16, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123232,24 +147424,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3360 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123260,11 +147452,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123274,13 +147466,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123301,6 +147496,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123310,6 +147506,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123324,14 +147521,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 787 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 934 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -123342,21 +147546,24 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -123365,7 +147572,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123378,27 +147585,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123409,10 +147616,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -123421,8 +147628,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -123430,6 +147637,11 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123441,6 +147653,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123450,6 +147663,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123459,6 +147673,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123473,20 +147688,27 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 788 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 935 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id045 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -123494,27 +147716,28 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123527,27 +147750,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123558,7 +147781,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -123572,13 +147795,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -123590,6 +147816,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123599,6 +147826,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123608,6 +147836,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123622,77 +147851,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 789 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 936 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123703,11 +147946,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123715,26 +147958,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123744,6 +147993,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123753,6 +148003,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123767,77 +148018,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 790 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 937 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 2 - LSPB: 16 - LVCA: 128 - LVCB: 16 + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123848,11 +148111,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123860,26 +148123,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123889,6 +148156,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -123898,6 +148166,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -123912,92 +148181,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 791 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 938 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id048 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 16 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 128 LSCB: 16 - LSPA: 1 - LSPB: 16 - LVCA: 256 - LVCB: 16 - LVPA: 1 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4640 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124005,26 +148288,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124034,6 +148323,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124043,6 +148333,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124057,33 +148348,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 792 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id050 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 939 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id048 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -124095,39 +148394,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124139,10 +148442,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124150,26 +148453,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124179,6 +148486,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124188,6 +148496,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124202,33 +148511,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 793 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 940 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -124240,39 +148559,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124284,10 +148607,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124295,26 +148618,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124324,6 +148653,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124333,6 +148663,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124347,33 +148678,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 794 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 941 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -124385,8 +148724,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -124401,23 +148740,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124429,10 +148772,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124440,26 +148783,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124469,6 +148816,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124478,6 +148826,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124492,77 +148841,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 795 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 942 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124573,10 +148936,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -124585,26 +148948,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124614,6 +148983,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124623,6 +148993,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124637,44 +149008,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 796 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 943 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id050 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id049 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -124684,32 +149065,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124720,10 +149097,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -124732,24 +149109,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124759,6 +149144,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124768,6 +149154,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124782,45 +149169,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 797 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 944 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id052 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DepthU: 16 + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -124838,35 +149234,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124874,13 +149270,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -124892,6 +149295,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124901,6 +149305,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -124910,6 +149315,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -124924,16 +149330,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 798 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id051 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 945 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -124941,16 +149355,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id053 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -124962,6 +149376,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -124977,27 +149392,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 4 + LSPA: 5 LSPB: 64 - LVCA: 64 + LVCA: 48 LVCB: 4 - LVPA: 2 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125009,9 +149424,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125020,13 +149435,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -125038,6 +149458,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125047,6 +149468,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125056,6 +149478,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125070,45 +149493,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 799 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 946 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -125123,27 +149557,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125154,10 +149588,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125166,24 +149600,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125193,6 +149635,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125202,6 +149645,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125216,80 +149660,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 800 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 947 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id053 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125300,11 +149753,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125312,24 +149765,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125339,6 +149800,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125348,6 +149810,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125362,80 +149825,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 801 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 948 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id055 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125446,11 +149918,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125458,24 +149930,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125485,6 +149963,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125494,6 +149973,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125508,47 +149988,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 802 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 949 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id055 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -125561,23 +150052,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125588,11 +150083,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125602,22 +150097,30 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125627,6 +150130,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125636,6 +150140,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125650,47 +150155,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 803 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id051 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 950 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -125703,27 +150217,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125734,10 +150248,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125746,13 +150260,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -125764,6 +150283,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125773,6 +150293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125782,6 +150303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125796,47 +150318,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 804 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 951 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -125849,23 +150382,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125876,11 +150413,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125890,22 +150427,30 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125915,6 +150460,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -125924,6 +150470,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -125938,47 +150485,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 805 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 952 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -125991,23 +150547,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126018,7 +150578,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -126034,20 +150594,26 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126057,6 +150623,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126066,6 +150633,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126080,33 +150648,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 806 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 953 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -126118,7 +150696,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -126133,27 +150712,23 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126165,9 +150740,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -126176,24 +150751,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126203,6 +150786,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126212,6 +150796,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126226,33 +150811,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 807 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 954 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -126264,7 +150857,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -126279,7 +150873,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -126291,15 +150885,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126324,11 +150918,18 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -126340,6 +150941,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126349,6 +150951,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126358,6 +150961,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126372,33 +150976,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 808 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 955 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id053 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -126410,6 +151022,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -126425,7 +151038,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -126437,15 +151050,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126475,154 +151088,6 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: false - UseBeta: true - UseInitialStrides: false - SolutionIndex: 809 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: *id053 - WorkGroupMapping: 8 - WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -126639,6 +151104,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126648,6 +151114,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126657,6 +151124,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126675,8 +151143,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 956 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126691,15 +151159,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -126719,8 +151187,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -126738,24 +151206,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126767,10 +151231,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126778,12 +151242,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126794,13 +151260,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126810,6 +151277,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126819,6 +151287,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126837,8 +151306,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + SolutionIndex: 957 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126846,24 +151315,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126875,16 +151342,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -126901,23 +151368,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126928,7 +151395,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -126942,9 +151409,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -126963,6 +151432,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126972,6 +151442,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -126981,6 +151452,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -126999,8 +151471,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 958 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127015,7 +151487,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -127023,9 +151495,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127071,15 +151541,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127092,9 +151562,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127102,12 +151572,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -127125,6 +151595,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127134,6 +151605,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127143,6 +151615,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127161,8 +151634,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 959 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127171,13 +151644,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -127199,47 +151672,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127252,11 +151721,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127264,13 +151733,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127280,13 +151751,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127296,6 +151768,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127305,6 +151778,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127323,33 +151797,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + SolutionIndex: 960 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127361,64 +151833,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCB: 8 + LSPA: 2 LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127427,12 +151895,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127442,13 +151912,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127458,6 +151929,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127467,6 +151939,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127485,8 +151958,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + SolutionIndex: 961 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127494,24 +151967,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127529,43 +152000,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127577,10 +152044,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127588,13 +152055,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127604,13 +152073,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127620,6 +152090,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127629,6 +152100,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127647,33 +152119,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 962 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127691,57 +152161,53 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127751,12 +152217,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127766,13 +152234,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127782,6 +152251,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127791,6 +152261,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127809,33 +152280,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 963 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127853,43 +152322,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127901,10 +152366,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127912,13 +152377,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127928,13 +152395,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127944,6 +152412,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -127953,6 +152422,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -127971,33 +152441,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 964 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128009,14 +152477,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -128035,27 +152503,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128074,13 +152542,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128097,6 +152567,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128106,6 +152577,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128115,6 +152587,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128133,8 +152606,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 965 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128154,12 +152627,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128171,7 +152642,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128180,7 +152651,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -128197,27 +152668,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128226,9 +152697,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128242,7 +152713,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128259,6 +152730,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128268,6 +152740,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128277,6 +152750,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128295,8 +152769,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 820 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + SolutionIndex: 966 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128305,21 +152779,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -128333,14 +152807,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -128358,39 +152832,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 LVPA: 2 - LVPB: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 640 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128398,13 +152872,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128421,6 +152897,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128430,6 +152907,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128439,6 +152917,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128457,33 +152936,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 821 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 967 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128495,7 +152972,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128504,7 +152981,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -128521,34 +152998,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -128560,13 +153037,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128583,6 +153060,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128592,6 +153070,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128601,6 +153080,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128619,8 +153099,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 822 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 968 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128640,10 +153120,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -128663,8 +153143,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -128682,24 +153162,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -128711,10 +153187,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128722,13 +153198,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128738,13 +153216,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128754,6 +153233,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128763,6 +153243,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128781,20 +153262,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 823 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 969 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -128802,12 +153283,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128825,10 +153304,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -128846,33 +153325,29 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -128884,13 +153359,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128900,13 +153375,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128916,6 +153392,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -128925,6 +153402,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -128943,8 +153421,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 824 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 970 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128952,7 +153430,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -128964,7 +153442,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -128981,53 +153459,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129047,11 +153521,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -129062,13 +153538,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129078,6 +153555,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129087,6 +153565,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129105,33 +153584,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 825 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + SolutionIndex: 971 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129143,53 +153620,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 + LVCA: 64 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129197,10 +153670,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129209,11 +153682,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -129224,13 +153699,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129240,6 +153716,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129249,6 +153726,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129267,33 +153745,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 826 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 972 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129305,64 +153781,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129370,12 +153846,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -129393,6 +153871,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129402,6 +153881,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129411,6 +153891,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129429,15 +153910,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 827 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 973 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -129445,17 +153926,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129467,49 +153946,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -129520,11 +153995,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129532,13 +154007,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129548,13 +154025,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129564,6 +154042,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129573,6 +154052,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129591,33 +154071,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 828 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 974 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129629,53 +154107,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129684,9 +154158,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129695,12 +154169,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129710,13 +154186,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129726,6 +154203,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129735,6 +154213,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129753,8 +154232,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 829 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 975 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129762,24 +154241,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129791,53 +154268,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129846,9 +154319,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129856,13 +154329,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129872,13 +154347,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129888,6 +154364,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -129897,6 +154374,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -129915,8 +154393,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 830 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 976 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129924,24 +154402,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129953,53 +154429,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 + LVCA: 64 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130007,10 +154483,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130019,11 +154495,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130041,6 +154519,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130050,6 +154529,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130059,6 +154539,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130077,33 +154558,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 831 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 977 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130115,60 +154594,56 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -130180,13 +154655,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130196,13 +154673,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130212,6 +154690,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130221,6 +154700,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130239,8 +154719,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 832 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 978 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130248,24 +154728,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130277,49 +154755,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130330,11 +154804,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130342,13 +154816,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130358,13 +154834,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130374,6 +154851,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130383,6 +154861,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130401,33 +154880,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 833 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + SolutionIndex: 979 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130445,58 +154922,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130504,12 +154977,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130520,13 +154995,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130536,6 +155012,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130545,6 +155022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130563,33 +155041,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 834 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 980 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130601,53 +155077,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130667,11 +155139,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130682,13 +155156,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130698,6 +155173,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -130707,6 +155183,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -130725,33 +155202,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 835 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 981 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130763,49 +155238,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130816,11 +155287,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130828,12 +155299,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130844,13 +155317,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130889,8 +155363,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 836 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + SolutionIndex: 982 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130898,24 +155372,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130927,49 +155399,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130980,11 +155448,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130992,13 +155460,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131008,13 +155478,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131053,33 +155524,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 837 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + SolutionIndex: 983 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131091,49 +155560,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 4 - LSPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 16 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131144,11 +155609,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131156,13 +155621,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131172,13 +155639,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131217,33 +155685,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 838 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + SolutionIndex: 984 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131255,49 +155721,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131308,11 +155770,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131320,13 +155782,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131336,13 +155800,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131381,33 +155846,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 839 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + SolutionIndex: 985 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131419,49 +155882,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131472,10 +155931,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131484,12 +155943,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -131500,13 +155961,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131545,33 +156007,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 840 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + SolutionIndex: 986 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 7 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131583,49 +156043,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131636,11 +156092,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131648,12 +156104,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -131664,13 +156122,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131709,33 +156168,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 841 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + SolutionIndex: 987 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 15 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131747,49 +156204,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131800,11 +156253,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131812,13 +156265,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131828,13 +156283,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131873,33 +156329,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 842 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + SolutionIndex: 988 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131911,49 +156365,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131964,11 +156414,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131976,12 +156426,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -131992,13 +156444,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132037,8 +156490,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 843 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + SolutionIndex: 989 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132046,24 +156499,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 17 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132082,38 +156533,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 64 LSCB: 8 - LSPA: 4 + LSPA: 2 LSPB: 16 LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 8 + LVCB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132125,10 +156576,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 16 - MacroTileA: 256 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132138,11 +156589,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132159,6 +156612,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132197,15 +156651,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 844 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + SolutionIndex: 990 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -132213,17 +156667,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 17 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132241,43 +156693,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132289,10 +156737,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132300,13 +156748,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132316,13 +156766,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132361,33 +156812,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 845 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + SolutionIndex: 991 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132399,45 +156848,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 4 - LSPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 LVCA: 64 - LVCB: 4 - LVPA: 1 + LVCB: 16 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132448,11 +156897,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 16 - MacroTileA: 256 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132462,11 +156911,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132483,6 +156934,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132521,15 +156973,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 846 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + SolutionIndex: 992 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -132537,17 +156989,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132559,16 +157009,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -132585,23 +157035,19 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132612,11 +157058,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132626,11 +157072,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132640,13 +157088,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132685,16 +157134,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 847 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + SolutionIndex: 993 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -132706,12 +157155,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132723,16 +157170,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -132749,27 +157196,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -132778,9 +157221,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132794,7 +157237,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132804,13 +157247,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132849,8 +157293,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 848 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + SolutionIndex: 994 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132858,22 +157302,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -132887,47 +157331,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132940,11 +157380,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132952,12 +157392,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -132968,7 +157408,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133013,8 +157453,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 849 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + SolutionIndex: 995 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133022,22 +157462,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 7 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -133051,64 +157491,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133116,8 +157552,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -133132,8 +157568,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -133177,31 +157613,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 850 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + SolutionIndex: 996 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 7 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -133221,58 +157657,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133280,8 +157712,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -133296,8 +157728,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -133341,29 +157773,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 851 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + SolutionIndex: 997 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 11 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -133385,41 +157817,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133433,10 +157861,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133444,8 +157872,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -133460,7 +157888,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133505,8 +157933,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 852 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + SolutionIndex: 998 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133514,20 +157942,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -133549,41 +157977,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133597,10 +158021,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133608,13 +158032,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133624,8 +158048,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -133669,29 +158093,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 853 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 999 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -133713,9 +158137,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -133723,31 +158147,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 16 + LSPB: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133761,10 +158181,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133772,13 +158192,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133788,7 +158208,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133833,29 +158253,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 854 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + SolutionIndex: 1000 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -133871,7 +158291,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -133879,56 +158299,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 16 + LSPB: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133936,12 +158356,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -133997,31 +158417,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 855 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 1001 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134035,15 +158455,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -134051,48 +158471,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134100,8 +158516,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134116,7 +158532,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -134161,31 +158577,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 856 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + SolutionIndex: 1002 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134205,41 +158621,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -134253,10 +158665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134264,8 +158676,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134280,8 +158692,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -134325,8 +158737,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 857 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + SolutionIndex: 1003 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -134334,20 +158746,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -134369,41 +158781,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -134417,9 +158825,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -134428,12 +158836,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -134444,8 +158852,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -134489,8 +158897,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 858 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + SolutionIndex: 1004 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -134498,20 +158906,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -134533,41 +158941,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 128 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -134581,10 +158985,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134592,13 +158996,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -134608,7 +159012,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -134653,29 +159057,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 859 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + SolutionIndex: 1005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 32, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -134697,58 +159101,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134756,8 +159156,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134772,8 +159172,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -134817,28 +159217,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 860 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + SolutionIndex: 1006 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -134855,15 +159255,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -134871,37 +159271,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -134909,10 +159305,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134920,10 +159316,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134938,14 +159332,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -134984,31 +159377,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 861 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1007 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -135020,7 +159415,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -135028,7 +159423,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -135036,23 +159431,23 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -135066,7 +159461,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -135074,10 +159469,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135085,11 +159480,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -135108,7 +159503,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135147,31 +159541,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 862 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1008 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135185,15 +159579,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -135201,37 +159595,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -135239,10 +159629,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135250,15 +159640,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135268,14 +159656,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135314,31 +159701,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 863 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1009 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -135350,7 +159739,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -135358,7 +159747,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -135366,37 +159755,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -135404,10 +159793,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135415,13 +159804,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135438,7 +159827,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135477,31 +159865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 864 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1010 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135515,64 +159903,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135580,15 +159964,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135598,14 +159980,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135644,31 +160025,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 865 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1011 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -135680,7 +160063,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -135688,56 +160071,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135745,13 +160128,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135768,7 +160151,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135807,31 +160189,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 866 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1012 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135851,43 +160233,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1056 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -135899,10 +160277,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135910,14 +160288,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -135928,14 +160304,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -135974,8 +160349,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 867 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1013 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -135983,22 +160358,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136016,43 +160393,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -136064,10 +160437,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136075,11 +160448,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -136091,14 +160464,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136137,8 +160509,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 868 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1014 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -136146,20 +160518,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -136175,47 +160547,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -136228,10 +160596,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -136240,11 +160608,9 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 @@ -136258,14 +160624,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136304,8 +160669,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 869 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1015 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -136313,14 +160678,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -136328,7 +160693,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136347,53 +160714,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 544 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136401,15 +160768,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136426,7 +160791,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136465,31 +160829,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 870 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1016 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136508,53 +160874,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136562,14 +160928,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -136587,7 +160951,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136626,31 +160989,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 871 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1017 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136662,7 +161027,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -136687,39 +161052,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136727,11 +161092,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -136750,7 +161115,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136789,32 +161153,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 872 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1018 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -136827,14 +161191,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -136853,21 +161217,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1536 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -136880,11 +161244,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136892,13 +161256,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -136911,13 +161273,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -136956,8 +161317,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 873 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1019 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -136966,21 +161327,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -136992,14 +161355,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -137012,44 +161375,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137057,14 +161420,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137082,7 +161443,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137121,31 +161481,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 874 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1020 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -137157,13 +161519,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137177,44 +161539,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137222,12 +161580,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137238,14 +161596,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137284,31 +161641,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 875 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1021 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137322,15 +161679,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -137338,37 +161695,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137376,10 +161733,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137387,10 +161744,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -137412,7 +161767,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137451,31 +161805,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 876 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1022 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -137487,7 +161843,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137495,7 +161851,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -137503,25 +161859,25 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -137529,11 +161885,11 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137541,10 +161897,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137552,11 +161908,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -137569,13 +161925,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137614,31 +161969,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 877 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1023 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137652,15 +162007,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -137668,37 +162023,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137706,10 +162057,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137717,13 +162068,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -137735,14 +162084,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137781,31 +162129,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 878 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1024 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -137817,7 +162167,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137825,7 +162175,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -137833,25 +162183,25 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -137859,11 +162209,11 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137871,10 +162221,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137882,11 +162232,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -137905,7 +162255,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -137944,31 +162293,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 879 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1025 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137982,60 +162331,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138043,14 +162396,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138061,14 +162412,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138107,31 +162457,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 880 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1026 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -138143,64 +162495,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138208,14 +162556,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138226,14 +162572,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138272,31 +162617,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 881 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1027 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -138308,7 +162655,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -138316,56 +162663,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138373,12 +162720,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138396,7 +162743,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138435,31 +162781,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 882 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1028 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -138473,60 +162819,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138534,15 +162884,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -138552,14 +162900,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138598,31 +162945,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 883 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1029 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -138634,39 +162983,39 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 64 + LSPB: 16 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 @@ -138676,7 +163025,7 @@ LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -138687,11 +163036,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138699,14 +163048,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138724,7 +163071,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138763,8 +163109,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 884 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1030 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -138773,21 +163119,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -138799,64 +163147,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138864,11 +163208,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138880,14 +163224,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -138926,31 +163269,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 885 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1031 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -138964,49 +163307,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139014,10 +163361,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139025,15 +163372,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139043,14 +163388,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139089,31 +163433,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 886 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1032 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139125,49 +163471,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139175,9 +163521,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -139186,15 +163532,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139205,13 +163549,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139250,14 +163593,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 887 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1033 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -139266,15 +163609,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139286,49 +163631,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139336,9 +163685,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -139347,15 +163696,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139365,14 +163712,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139411,31 +163757,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 888 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1034 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139447,60 +163795,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139508,15 +163860,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139526,14 +163876,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139572,31 +163921,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 889 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1035 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139608,60 +163959,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139669,15 +164024,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139687,14 +164040,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139733,31 +164085,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 890 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1036 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139769,15 +164123,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -139785,37 +164139,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 832 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139823,10 +164173,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139834,13 +164184,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -139852,14 +164200,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -139898,31 +164245,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 891 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1037 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -139934,7 +164283,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -139942,7 +164291,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -139950,37 +164299,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 1856 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139988,10 +164337,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139999,11 +164348,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -140022,7 +164371,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140061,31 +164409,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 892 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1038 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140099,64 +164447,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140164,14 +164512,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -140189,7 +164535,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140228,31 +164573,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 893 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1039 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -140264,7 +164611,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -140272,7 +164619,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -140280,37 +164627,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140318,10 +164665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140329,13 +164676,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140346,13 +164693,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140391,31 +164737,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 894 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1040 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140429,60 +164775,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140490,15 +164840,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140508,14 +164856,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140554,31 +164901,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 895 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1041 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -140590,59 +164939,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -140651,13 +165004,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140667,14 +165020,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140713,31 +165065,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 896 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1042 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140751,49 +165103,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140801,10 +165157,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140812,15 +165168,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140830,14 +165184,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -140876,31 +165229,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 897 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1043 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -140912,49 +165267,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140962,10 +165321,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140973,15 +165332,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140991,14 +165348,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141037,31 +165393,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 898 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1044 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141073,41 +165431,41 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3080 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -141115,11 +165473,11 @@ LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141127,10 +165485,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141138,15 +165496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141163,7 +165519,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141202,31 +165557,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 899 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1045 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141238,49 +165595,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141288,10 +165649,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141299,15 +165660,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141317,14 +165676,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141363,31 +165721,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 900 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1046 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141399,49 +165759,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 1088 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141449,9 +165809,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -141460,15 +165820,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141479,13 +165837,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141524,31 +165881,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 901 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1047 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141560,49 +165919,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141610,9 +165973,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -141621,15 +165984,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141639,14 +166000,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141685,31 +166045,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 902 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1048 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141721,53 +166083,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3080 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141775,10 +166137,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141786,14 +166148,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -141805,13 +166165,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -141850,31 +166209,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 903 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1049 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -141886,49 +166247,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141936,10 +166301,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141947,15 +166312,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141965,14 +166328,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142011,31 +166373,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 904 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1050 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142047,49 +166411,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -142097,10 +166465,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142108,15 +166476,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142126,14 +166492,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142172,31 +166537,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 905 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1051 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142208,60 +166575,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142269,15 +166640,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142287,14 +166656,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142333,31 +166701,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 906 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1052 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142369,60 +166739,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142430,15 +166804,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142448,14 +166820,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142494,31 +166865,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 907 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1053 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142537,38 +166910,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -142580,9 +166953,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -142591,15 +166964,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142610,13 +166981,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142655,31 +167025,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 908 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1054 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142698,38 +167070,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -142741,10 +167113,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142752,13 +167124,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 128 PackBatchDims: 0 @@ -142777,7 +167147,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142816,31 +167185,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 909 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1055 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -142859,53 +167230,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142913,15 +167284,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142938,7 +167307,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -142977,31 +167345,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 910 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1056 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143020,53 +167390,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143074,15 +167444,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143099,7 +167467,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143138,31 +167505,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 911 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1057 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143181,52 +167550,52 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 4 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 + LSPA: 16 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -143235,14 +167604,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -143254,13 +167621,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143299,31 +167665,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 912 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1058 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143335,60 +167703,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143396,14 +167764,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -143421,7 +167787,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143460,31 +167825,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 913 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1059 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143503,38 +167870,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -143546,10 +167913,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143557,13 +167924,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -143576,13 +167941,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143621,31 +167985,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 914 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1060 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143657,59 +168023,59 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -143718,14 +168084,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -143737,13 +168101,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143782,31 +168145,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 915 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1061 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143818,59 +168183,59 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -143879,15 +168244,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143904,7 +168267,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -143943,31 +168305,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 916 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1062 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -143979,60 +168343,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144040,15 +168404,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144059,13 +168421,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -144104,31 +168465,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 917 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 1063 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -144147,53 +168510,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 4 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144201,15 +168564,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144220,13 +168581,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -144265,31 +168625,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 918 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1064 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -144301,15 +168663,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -144317,44 +168679,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCA: 4 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144362,15 +168724,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144387,7 +168747,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -144426,31 +168785,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 919 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1065 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -144462,15 +168823,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -144478,44 +168839,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCA: 4 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144523,13 +168888,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144539,14 +168904,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -144585,31 +168949,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 920 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1066 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144623,13 +168987,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144643,40 +169007,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 4 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144684,12 +169052,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -144700,7 +169068,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144745,16 +169113,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 921 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1067 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -144766,10 +169134,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144783,7 +169151,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -144808,20 +169176,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 4 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1344 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144832,11 +169200,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144849,7 +169217,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -144861,7 +169229,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144905,15 +169273,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 922 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1068 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -144926,10 +169294,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144943,13 +169311,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144968,34 +169336,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145004,12 +169376,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145020,8 +169392,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145065,20 +169437,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 923 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1069 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -145086,10 +169458,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145103,13 +169475,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -145123,40 +169495,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145164,8 +169540,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -145180,8 +169556,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -145225,20 +169601,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 924 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1070 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -145246,10 +169622,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145263,13 +169639,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -145283,36 +169659,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -145324,13 +169704,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145340,8 +169720,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145385,20 +169765,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 925 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1071 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -145406,10 +169786,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145423,7 +169803,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145443,40 +169823,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145484,13 +169864,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145545,31 +169925,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 926 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1072 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145583,7 +169963,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145608,28 +169988,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3392 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -145637,10 +170017,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145653,7 +170033,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145709,31 +170089,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 927 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1073 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145747,7 +170127,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145767,25 +170147,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 4 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 2240 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145796,11 +170176,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145808,12 +170188,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145869,31 +170249,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 928 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1074 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145907,13 +170287,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -145932,34 +170312,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145968,12 +170352,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145984,8 +170368,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146029,20 +170413,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 929 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 1075 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -146050,10 +170434,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146067,13 +170451,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -146087,40 +170471,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146128,12 +170516,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146144,8 +170532,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146189,31 +170577,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 930 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1076 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146227,7 +170615,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -146247,40 +170635,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146288,8 +170676,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -146349,15 +170737,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 931 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1077 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -146370,10 +170758,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146387,13 +170775,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -146412,24 +170800,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -146437,10 +170829,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146453,7 +170845,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146464,8 +170856,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146509,31 +170901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 932 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1078 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146553,7 +170945,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -146572,35 +170964,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 32 LSPA: 32 LSPB: 16 - LVCA: 8 + LVCA: 4 LVCB: 16 LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146613,7 +171009,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146624,8 +171020,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146669,20 +171065,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 933 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1079 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -146690,8 +171086,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [2, 16, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -146727,7 +171123,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -146735,15 +171131,15 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -146762,9 +171158,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146772,7 +171168,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -146833,20 +171229,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 934 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1080 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -146854,7 +171250,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -146877,7 +171273,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -146896,20 +171292,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 16 LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -146921,9 +171321,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 8 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -146938,7 +171338,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146948,7 +171348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -146993,16 +171393,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 935 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1081 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 4 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -147014,7 +171414,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -147051,29 +171451,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -147085,10 +171485,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147096,13 +171496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147157,28 +171557,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 936 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1082 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -147201,7 +171601,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147215,40 +171615,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147256,13 +171660,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147272,7 +171676,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147317,16 +171721,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 937 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1083 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -147338,8 +171742,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147372,47 +171776,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147420,13 +171824,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147481,28 +171885,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 938 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1084 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -147519,13 +171923,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147539,40 +171943,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1056 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147580,11 +171988,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -147596,7 +172004,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147641,20 +172049,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 939 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1085 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -147662,10 +172070,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147679,13 +172087,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147699,40 +172107,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147740,12 +172152,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147756,8 +172168,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -147801,16 +172213,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 940 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1086 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [2, 4] ThreadTile0: 2 ThreadTile1: 4 @@ -147822,10 +172234,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147839,13 +172251,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147859,40 +172271,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147900,12 +172316,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147916,7 +172332,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147961,31 +172377,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 941 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1087 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147999,13 +172415,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148024,24 +172440,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 544 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -148049,9 +172469,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -148064,9 +172484,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -148076,7 +172496,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148121,31 +172541,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 942 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1088 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148159,13 +172579,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148184,34 +172604,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -148220,11 +172644,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148236,7 +172660,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148281,16 +172705,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 943 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1089 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 @@ -148302,10 +172726,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148319,7 +172743,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148339,33 +172763,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -148373,9 +172797,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -148384,7 +172808,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -148445,31 +172869,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 944 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1090 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148508,38 +172932,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -148553,8 +172977,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -148609,16 +173033,180 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 945 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1091 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1092 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 4 + SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -148630,7 +173218,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -148664,7 +173252,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -148693,14 +173281,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -148773,28 +173361,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 946 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1093 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -148828,7 +173416,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -148845,7 +173433,7 @@ LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -148853,14 +173441,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -148933,28 +173521,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 947 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1094 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -148988,7 +173576,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -149017,14 +173605,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -149097,28 +173685,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 948 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1095 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -149152,46 +173740,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -149200,11 +173788,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -149217,7 +173805,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -149261,29 +173849,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 949 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1096 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -149299,13 +173887,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -149324,24 +173912,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149349,10 +173941,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149360,11 +173952,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -149376,7 +173968,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -149421,31 +174013,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 950 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1097 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149459,7 +174051,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149484,28 +174076,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149513,10 +174105,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149524,11 +174116,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -149585,31 +174177,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 951 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1098 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149623,13 +174215,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -149648,28 +174240,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149677,10 +174265,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149688,12 +174276,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149704,7 +174292,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -149749,20 +174337,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 952 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1099 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149770,10 +174358,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149787,7 +174375,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149812,35 +174400,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149848,11 +174436,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -149865,7 +174453,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -149909,20 +174497,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 953 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1100 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149930,10 +174518,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149947,7 +174535,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149955,56 +174543,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150012,12 +174600,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -150073,31 +174661,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 954 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1101 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150111,7 +174699,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150119,56 +174707,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150176,13 +174764,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -150237,31 +174825,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 955 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1102 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150283,41 +174871,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -150330,9 +174918,9 @@ LoopTail: true LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150340,11 +174928,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -150401,8 +174989,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 956 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 1103 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -150411,19 +174999,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -150439,13 +175027,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -150459,29 +175047,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -150489,10 +175081,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150500,8 +175092,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150516,7 +175108,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -150561,31 +175153,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 957 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1104 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150599,13 +175191,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -150619,33 +175211,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -150653,10 +175241,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150664,8 +175252,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150680,7 +175268,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -150725,31 +175313,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 958 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1105 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150763,13 +175351,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -150788,24 +175376,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -150813,10 +175405,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150824,11 +175416,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -150840,7 +175432,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -150885,31 +175477,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 959 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1106 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150923,7 +175515,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150948,16 +175540,16 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -150976,11 +175568,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150988,13 +175580,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151049,31 +175641,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 960 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1107 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151087,13 +175679,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151112,39 +175704,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151152,11 +175740,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -151168,8 +175756,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -151213,20 +175801,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 961 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1108 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -151234,10 +175822,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151251,7 +175839,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151276,22 +175864,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -151304,10 +175892,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -151316,13 +175904,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151377,20 +175965,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 962 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1109 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -151398,10 +175986,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151415,7 +176003,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151435,21 +176023,21 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 832 + LdsNumElements: 8192 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -151464,11 +176052,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151476,13 +176064,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151493,7 +176081,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -151537,31 +176125,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 963 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1110 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151575,7 +176163,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151595,27 +176183,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 + LdsNumElements: 8192 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -151628,11 +176216,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151640,13 +176228,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151701,31 +176289,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 964 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1111 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151739,13 +176327,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151756,7 +176344,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -151764,28 +176352,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -151793,10 +176377,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151804,13 +176388,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -151820,8 +176404,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -151865,31 +176449,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 965 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1112 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151903,13 +176487,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151920,47 +176504,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151968,12 +176548,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151984,8 +176564,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -152029,31 +176609,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 966 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1113 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152084,30 +176664,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 16 LVPA: 16 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152121,10 +176701,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152132,13 +176712,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152193,28 +176773,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 967 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1114 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -152248,7 +176828,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -152256,22 +176836,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152285,9 +176865,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -152296,13 +176876,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152357,20 +176937,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 968 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1115 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -152378,7 +176958,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -152412,47 +176992,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 64 + LVCA: 64 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152460,13 +177040,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152477,7 +177057,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -152521,28 +177101,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 969 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1116 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -152576,30 +177156,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152613,10 +177193,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152624,13 +177204,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152685,29 +177265,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 970 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1117 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -152740,7 +177320,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -152748,22 +177328,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 32 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152777,10 +177357,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152788,13 +177368,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152849,20 +177429,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 971 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 + SolutionIndex: 1118 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -152870,8 +177450,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -152887,7 +177467,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152895,56 +177475,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152952,8 +177532,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -152969,7 +177549,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -153013,31 +177593,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 972 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1119 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153051,60 +177631,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153112,11 +177696,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -153128,7 +177712,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -153173,31 +177757,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 973 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1120 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153211,64 +177795,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153276,8 +177856,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -153292,8 +177872,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -153337,31 +177917,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 974 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1121 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153375,7 +177955,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -153383,56 +177963,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153440,11 +178020,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -153457,7 +178037,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -153501,31 +178081,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 975 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1122 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153539,7 +178119,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -153547,56 +178127,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153604,12 +178184,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -153665,31 +178245,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 976 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1123 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153703,7 +178283,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -153711,56 +178291,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153768,12 +178348,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -153829,31 +178409,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 977 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1124 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153875,56 +178455,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153932,13 +178512,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153949,7 +178529,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -153993,29 +178573,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 978 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1125 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -154039,7 +178619,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -154047,33 +178627,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 32 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154085,10 +178665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154096,13 +178676,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154157,29 +178737,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 979 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -154195,60 +178775,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154256,13 +178840,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154272,8 +178856,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -154317,31 +178901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 980 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154355,15 +178939,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -154371,44 +178955,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 32 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154417,12 +179005,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154432,7 +179020,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -154477,31 +179065,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 981 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 + SolutionIndex: 1128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154515,45 +179103,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154564,11 +179156,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154576,13 +179168,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154592,8 +179184,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -154637,31 +179229,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 982 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154675,15 +179267,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -154691,44 +179283,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154736,13 +179332,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154752,7 +179348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -154797,31 +179393,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 983 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154835,15 +179431,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -154851,29 +179447,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154884,11 +179484,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154896,13 +179496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154912,8 +179512,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -154957,31 +179557,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 984 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154995,45 +179595,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 32 LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155044,10 +179648,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155056,13 +179660,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155072,7 +179676,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -155117,31 +179721,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 985 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155155,15 +179759,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155171,33 +179775,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -155205,10 +179813,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155216,13 +179824,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155232,8 +179840,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -155277,31 +179885,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 986 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155315,60 +179923,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 4 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155376,12 +179988,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -155392,8 +180004,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -155437,31 +180049,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 987 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155475,15 +180087,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155491,29 +180103,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155524,10 +180140,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155536,13 +180152,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155552,8 +180168,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -155597,31 +180213,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 988 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155635,45 +180251,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155684,10 +180304,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155696,13 +180316,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155712,8 +180332,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -155757,31 +180377,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 989 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155795,45 +180415,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155844,10 +180468,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155856,13 +180480,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155872,8 +180496,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -155917,31 +180541,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 990 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155955,15 +180579,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155971,43 +180595,47 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -156016,13 +180644,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156032,7 +180660,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156077,31 +180705,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 991 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156115,13 +180743,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -156135,29 +180763,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156168,10 +180792,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -156180,12 +180804,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -156196,8 +180820,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -156241,31 +180865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 992 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 32, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156279,13 +180903,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -156299,29 +180923,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156332,10 +180952,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -156344,12 +180964,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -156360,7 +180980,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156405,31 +181025,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 993 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 + SolutionIndex: 1140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156449,54 +181069,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1344 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156504,11 +181128,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -156520,7 +181144,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156565,29 +181189,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 994 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156603,13 +181227,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -156628,39 +181252,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156668,12 +181288,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -156684,7 +181304,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156729,31 +181349,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 995 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156767,13 +181387,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -156787,33 +181407,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -156821,10 +181437,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156832,8 +181448,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -156848,8 +181464,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -156893,31 +181509,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 996 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -156931,7 +181547,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -156951,33 +181567,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -156985,10 +181601,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156996,8 +181612,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157057,31 +181673,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 997 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157095,7 +181711,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157115,40 +181731,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157156,8 +181772,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157217,20 +181833,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 998 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -157238,10 +181854,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157255,7 +181871,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157275,44 +181891,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157320,8 +181936,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157381,20 +181997,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 999 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -157402,10 +182018,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157419,7 +182035,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157439,23 +182055,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 2 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2240 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -157468,10 +182084,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 4 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -157480,12 +182096,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -157541,31 +182157,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1000 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 32, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157579,13 +182195,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -157604,28 +182220,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -157633,10 +182245,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157644,8 +182256,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157660,7 +182272,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -157705,31 +182317,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1001 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 + SolutionIndex: 1148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157743,7 +182355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157763,23 +182375,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -157789,18 +182401,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157808,12 +182420,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -157869,31 +182481,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1002 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 + SolutionIndex: 1149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -157907,13 +182519,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -157927,40 +182539,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157968,8 +182584,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157984,7 +182600,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -158029,20 +182645,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1003 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158050,10 +182666,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -158087,27 +182703,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 LSPA: 32 LSPB: 16 - LVCA: 4 + LVCA: 8 LVCB: 16 LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -158121,9 +182737,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158132,7 +182748,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -158193,8 +182809,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1004 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -158203,10 +182819,10 @@ SubGroupA: 4 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158231,7 +182847,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -158239,56 +182855,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158296,12 +182912,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158357,31 +182973,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1005 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 + SolutionIndex: 1152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 16, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -158395,7 +183011,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -158403,45 +183019,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -158449,10 +183065,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158460,11 +183076,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158521,31 +183137,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1006 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -158559,7 +183175,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -158567,45 +183183,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -158613,10 +183229,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158624,11 +183240,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158685,31 +183301,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1007 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -158731,56 +183347,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158788,12 +183404,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158849,28 +183465,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1008 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 + SolutionIndex: 1155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -158895,41 +183511,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -158942,9 +183558,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158952,12 +183568,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159013,29 +183629,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1009 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 + SolutionIndex: 1156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -159059,52 +183675,52 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -159117,11 +183733,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159177,29 +183793,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1010 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -159215,7 +183831,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -159235,33 +183851,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159270,9 +183886,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159280,11 +183896,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -159341,31 +183957,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1011 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 + SolutionIndex: 1158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -159379,13 +183995,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -159399,33 +184015,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159433,10 +184045,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159444,8 +184056,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -159460,7 +184072,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -159505,20 +184117,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1012 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -159526,10 +184138,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -159543,7 +184155,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -159563,27 +184175,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -159596,11 +184208,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159608,11 +184220,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -159669,31 +184281,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1013 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 + SolutionIndex: 1160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -159741,13 +184353,13 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -159762,9 +184374,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159772,12 +184384,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159833,20 +184445,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1014 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 + SolutionIndex: 1161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -159854,8 +184466,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -159896,39 +184508,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159940,8 +184552,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159997,28 +184609,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1015 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 + SolutionIndex: 1162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -160055,44 +184667,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160100,12 +184712,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -160161,20 +184773,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1016 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -160182,8 +184794,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -160199,7 +184811,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160207,7 +184819,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160215,33 +184827,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -160252,7 +184864,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -160264,13 +184876,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -160325,31 +184937,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1017 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 + SolutionIndex: 1164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -160363,7 +184975,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160371,7 +184983,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160379,37 +184991,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -160417,10 +185029,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160428,11 +185040,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -160445,7 +185057,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -160489,31 +185101,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1018 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -160527,7 +185139,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160535,7 +185147,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160543,48 +185155,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160592,8 +185204,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160609,7 +185221,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -160653,31 +185265,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1019 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -160691,60 +185303,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160752,8 +185368,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160768,7 +185384,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -160813,31 +185429,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1020 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -160859,56 +185475,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160916,11 +185532,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -160977,28 +185593,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1021 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -161023,7 +185639,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -161031,48 +185647,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161080,13 +185696,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -161141,29 +185757,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1022 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 32 SubGroup1: 4 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -161179,7 +185795,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161187,45 +185803,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161233,10 +185849,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161244,8 +185860,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -161305,15 +185921,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1023 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -161325,11 +185941,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161343,7 +185959,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161351,56 +185967,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161408,11 +186024,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -161469,15 +186085,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1024 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -161489,11 +186105,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161507,7 +186123,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161515,41 +186131,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161557,10 +186173,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161568,8 +186184,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -161585,7 +186201,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -161629,15 +186245,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1025 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -161649,11 +186265,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161667,60 +186283,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161728,12 +186348,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -161744,8 +186364,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -161789,16 +186409,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1026 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -161809,11 +186429,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161827,7 +186447,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161852,39 +186472,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161892,8 +186512,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -161953,15 +186573,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1027 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 @@ -161974,10 +186594,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -161991,7 +186611,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162017,27 +186637,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162046,9 +186666,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162056,11 +186676,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -162117,15 +186737,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1028 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -162138,10 +186758,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162180,39 +186800,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162220,12 +186840,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162281,15 +186901,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1029 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -162302,7 +186922,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -162319,13 +186939,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -162336,7 +186956,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -162345,38 +186965,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 32 LSPA: 8 - LSPB: 64 + LSPB: 16 LVCA: 32 - LVCB: 4 + LVCB: 16 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162384,12 +187000,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162400,7 +187016,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -162445,16 +187061,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1030 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -162466,10 +187082,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162483,7 +187099,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162500,7 +187116,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -162509,34 +187125,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 32 LSPA: 8 - LSPB: 64 + LSPB: 16 LVCA: 32 - LVCB: 4 + LVCB: 16 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162544,12 +187160,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162605,15 +187221,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1031 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -162626,10 +187242,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162643,7 +187259,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162660,7 +187276,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -162668,28 +187284,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162697,10 +187313,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162708,12 +187324,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162769,16 +187385,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1032 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -162790,10 +187406,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162807,7 +187423,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162824,7 +187440,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -162832,22 +187448,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -162860,10 +187476,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -162876,9 +187492,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162933,16 +187549,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1033 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -162954,10 +187570,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -162971,7 +187587,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162996,35 +187612,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 8 LSPA: 8 LSPB: 32 - LVCA: 32 - LVCB: 8 + LVCA: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163032,12 +187648,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163049,7 +187665,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -163093,31 +187709,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1034 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -163131,13 +187747,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -163156,38 +187772,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 32 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -163202,7 +187814,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163212,7 +187824,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -163257,31 +187869,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1035 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -163323,32 +187935,32 @@ LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 1024 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163362,7 +187974,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163417,29 +188029,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1036 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -163461,7 +188073,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -163483,36 +188095,32 @@ LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163526,7 +188134,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163536,7 +188144,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -163581,29 +188189,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1037 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -163619,7 +188227,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -163644,18 +188252,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 LSPB: 32 - LVCA: 32 - LVCB: 8 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 528 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -163668,11 +188276,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163680,12 +188288,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163697,7 +188305,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -163741,8 +188349,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1038 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -163751,11 +188359,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -163765,7 +188373,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -163804,35 +188412,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163840,12 +188448,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163857,7 +188465,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -163901,29 +188509,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1039 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -163939,13 +188547,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -163964,28 +188572,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 16 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -163993,10 +188597,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164004,13 +188608,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164020,7 +188624,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -164065,31 +188669,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1040 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 + SolutionIndex: 1187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -164103,7 +188707,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -164128,28 +188732,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 + LVCA: 16 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -164157,9 +188761,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164172,9 +188776,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164229,31 +188833,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1041 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -164267,13 +188871,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -164292,28 +188896,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 64 - LVCB: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -164321,9 +188921,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164336,9 +188936,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164348,8 +188948,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -164393,20 +188993,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1042 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 + SolutionIndex: 1189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -164414,10 +189014,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -164437,7 +189037,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -164451,27 +189051,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 64 - LVCA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 LVPA: 16 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -164485,10 +189081,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164496,13 +189092,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164512,7 +189108,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -164557,29 +189153,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1043 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 + SolutionIndex: 1190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -164615,27 +189211,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -164649,10 +189245,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164660,13 +189256,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164721,29 +189317,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1044 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -164759,7 +189355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -164767,56 +189363,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164824,11 +189420,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -164885,31 +189481,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1045 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -164923,7 +189519,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -164931,56 +189527,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -164988,8 +189584,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -165049,31 +189645,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1046 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165087,60 +189683,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165148,11 +189748,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -165164,8 +189764,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -165209,31 +189809,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1047 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165247,7 +189847,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -165256,7 +189856,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165273,38 +189873,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165312,11 +189912,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -165373,15 +189973,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1048 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -165394,10 +189994,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165411,7 +190011,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -165420,7 +190020,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165436,28 +190036,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPB: 32 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165465,10 +190065,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165476,13 +190076,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -165537,31 +190137,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1049 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 + SolutionIndex: 1196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165575,7 +190175,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -165584,7 +190184,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165600,18 +190200,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -165621,18 +190221,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165640,8 +190240,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -165701,20 +190301,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1050 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -165722,10 +190322,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -165748,7 +190348,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165767,36 +190367,36 @@ LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12416 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165804,12 +190404,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165865,15 +190465,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1051 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -165886,8 +190486,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -165928,38 +190528,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -165968,11 +190568,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 512 PackBatchDims: 0 @@ -166029,14 +190629,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1052 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -166050,8 +190650,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -166059,7 +190659,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166067,14 +190667,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -166091,36 +190691,37 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -166132,13 +190733,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166148,6 +190751,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166155,6 +190759,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166193,37 +190798,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1053 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 + SolutionIndex: 1200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166231,7 +190834,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166240,7 +190843,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -166255,40 +190858,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166296,13 +190900,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166312,6 +190916,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166319,6 +190924,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166357,20 +190963,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1054 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -166378,16 +190984,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166395,7 +191001,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166403,45 +191009,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166449,9 +191056,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166460,13 +191067,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166476,13 +191083,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166521,37 +191130,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1055 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 + SolutionIndex: 1202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166559,7 +191168,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166567,7 +191176,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -166575,47 +191184,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 32 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCA: 48 + LVCB: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166624,13 +191234,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166640,13 +191250,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166685,37 +191297,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1056 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 + SolutionIndex: 1203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166723,7 +191335,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166731,7 +191343,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -166739,37 +191351,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166777,10 +191390,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166788,13 +191401,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166804,6 +191417,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166811,6 +191425,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166849,37 +191464,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1057 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 + SolutionIndex: 1204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166887,7 +191502,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -166895,45 +191510,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166941,9 +191557,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166952,13 +191568,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166968,6 +191584,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166975,6 +191592,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167013,37 +191631,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1058 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 + SolutionIndex: 1205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167051,15 +191669,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -167067,37 +191685,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 32 - LSPB: 64 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167105,10 +191724,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167116,13 +191735,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167132,6 +191753,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167139,6 +191761,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167177,37 +191800,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1059 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 + SolutionIndex: 1206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167215,7 +191836,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -167223,7 +191844,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -167235,29 +191856,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 8 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -167268,11 +191890,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167280,12 +191902,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -167296,6 +191918,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167303,6 +191926,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167341,8 +191965,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1060 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -167350,28 +191974,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167379,53 +192003,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167433,10 +192058,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167444,13 +192069,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167460,13 +192087,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167505,37 +192134,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1061 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 + SolutionIndex: 1208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167543,53 +192170,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167597,9 +192225,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167608,13 +192236,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167624,13 +192254,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167669,37 +192301,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1062 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 + SolutionIndex: 1209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167707,53 +192337,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167761,9 +192392,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167772,13 +192403,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167788,6 +192421,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167795,6 +192429,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167833,16 +192468,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1063 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 + SolutionIndex: 1210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -167853,17 +192488,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167871,14 +192504,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -167895,36 +192528,37 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 32 + LSCB: 8 + LSPA: 8 LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -167936,13 +192570,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167952,13 +192588,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167997,20 +192635,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1064 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 + SolutionIndex: 1211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -168018,16 +192656,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168041,9 +192677,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168051,43 +192687,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 8 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168096,8 +192737,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -168112,13 +192755,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168157,37 +192802,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1065 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168201,9 +192844,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168211,43 +192854,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 8 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168256,8 +192904,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -168272,13 +192922,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168317,37 +192969,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1066 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168355,14 +193005,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -168379,19 +193029,20 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -168401,7 +193052,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -168409,9 +193060,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168420,12 +193071,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -168436,6 +193089,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -168443,6 +193097,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168481,16 +193136,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1067 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -168502,16 +193157,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168525,7 +193178,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -168543,35 +193196,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168580,11 +193238,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -168596,13 +193254,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168641,20 +193301,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1068 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -168662,7 +193322,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -168671,7 +193331,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168685,7 +193345,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -168703,35 +193363,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168740,11 +193405,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -168756,13 +193421,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168801,20 +193468,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1069 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -168822,8 +193489,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -168831,7 +193498,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -168847,7 +193514,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168855,47 +193522,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168904,8 +193572,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -168920,13 +193588,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -168965,29 +193635,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1070 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -168995,7 +193665,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169003,15 +193673,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -169019,33 +193689,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -169053,9 +193728,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169064,12 +193739,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169080,13 +193755,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169125,37 +193802,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1071 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169163,7 +193840,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -169171,7 +193848,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -169179,37 +193856,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -169217,9 +193895,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169228,12 +193906,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169244,6 +193922,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -169251,6 +193930,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169289,37 +193969,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1072 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169327,15 +194007,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -169343,43 +194023,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169388,12 +194073,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169404,13 +194089,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169449,37 +194136,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1073 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169493,53 +194180,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169548,12 +194240,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169564,13 +194256,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169609,28 +194303,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1074 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -169639,7 +194333,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169647,53 +194341,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 16 + LVCA: 128 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -169701,10 +194396,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -169712,12 +194407,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -169728,6 +194425,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -169735,6 +194433,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169773,37 +194472,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1075 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169811,14 +194508,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -169835,29 +194532,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -169865,10 +194563,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -169876,11 +194574,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -169892,13 +194592,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -169937,37 +194639,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1076 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -169975,14 +194675,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -169999,40 +194699,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170040,11 +194741,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -170056,6 +194759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -170063,6 +194767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170101,37 +194806,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1077 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170139,14 +194842,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170163,29 +194866,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170193,10 +194897,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170204,8 +194908,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -170220,13 +194926,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170265,37 +194973,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1078 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170303,14 +195009,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170327,29 +195033,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170357,10 +195064,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170368,8 +195075,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -170384,13 +195093,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170429,37 +195140,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1079 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 + SolutionIndex: 1226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170467,14 +195176,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170491,29 +195200,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170521,10 +195231,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170532,8 +195242,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -170548,6 +195260,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -170555,6 +195268,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170593,37 +195307,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1080 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170631,14 +195343,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170655,40 +195367,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170696,12 +195409,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -170712,6 +195427,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -170719,6 +195435,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170757,37 +195474,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1081 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170795,14 +195510,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -170819,29 +195534,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170849,10 +195565,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170860,11 +195576,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -170876,6 +195594,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -170883,6 +195602,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -170921,37 +195641,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1082 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -170959,7 +195677,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -170983,15 +195701,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -171005,7 +195724,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -171013,10 +195732,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171024,8 +195743,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -171040,13 +195759,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171085,37 +195806,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1083 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171123,7 +195844,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -171131,56 +195852,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171188,8 +195910,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -171204,6 +195926,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -171211,6 +195934,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171249,37 +195973,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1084 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171287,56 +196011,57 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -171350,6 +196075,8 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -171364,13 +196091,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171409,37 +196138,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1085 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171447,14 +196174,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -171471,36 +196198,37 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -171514,10 +196242,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -171528,6 +196258,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -171535,6 +196266,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171573,37 +196305,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1086 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171635,6 +196365,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 @@ -171657,14 +196388,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -171692,6 +196423,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -171699,6 +196431,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171737,28 +196470,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1087 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -171767,7 +196500,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171799,6 +196532,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 @@ -171809,30 +196543,30 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171840,12 +196574,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -171856,6 +196590,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -171863,6 +196598,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -171901,28 +196637,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1088 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -171931,7 +196667,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171946,7 +196682,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -171963,6 +196699,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 @@ -171985,14 +196722,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -172006,6 +196743,8 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -172020,6 +196759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172027,6 +196767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172065,20 +196806,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1089 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -172086,16 +196827,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172110,8 +196849,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -172123,29 +196862,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -172157,9 +196897,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -172168,8 +196908,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -172184,6 +196926,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172191,6 +196934,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172229,8 +196973,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1090 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172238,28 +196982,26 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172274,7 +197016,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -172291,6 +197033,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 @@ -172334,6 +197077,8 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -172348,6 +197093,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172355,6 +197101,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172393,8 +197140,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1091 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172402,7 +197149,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -172418,12 +197165,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172431,16 +197176,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -172455,23 +197200,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -172484,7 +197230,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -172498,9 +197244,11 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -172512,6 +197260,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172519,6 +197268,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172557,8 +197307,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1092 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172566,7 +197316,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -172579,15 +197329,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172595,7 +197343,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172603,41 +197351,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -172648,11 +197397,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172660,12 +197409,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -172676,6 +197425,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172683,6 +197433,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172721,8 +197472,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1093 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 1240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172730,28 +197481,28 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172759,7 +197510,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172768,7 +197519,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -172783,23 +197534,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -172812,7 +197564,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -172828,7 +197580,7 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -172840,6 +197592,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -172847,6 +197600,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -172885,8 +197639,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1094 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172894,7 +197648,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -172907,15 +197661,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172932,7 +197686,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -172947,39 +197701,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -172994,7 +197749,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173004,6 +197759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173011,6 +197767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173049,29 +197806,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1095 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 + SolutionIndex: 1242 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -173079,7 +197836,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173094,8 +197851,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -173107,29 +197864,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 8 LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -173141,9 +197899,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173152,8 +197910,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -173168,6 +197928,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173175,6 +197936,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173213,37 +197975,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1096 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 + SolutionIndex: 1243 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173258,56 +198018,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173316,12 +198077,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -173332,6 +198095,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173339,6 +198103,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173377,37 +198142,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1097 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1244 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173421,39 +198184,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -173465,9 +198233,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173476,12 +198244,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -173492,13 +198262,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173537,37 +198309,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1098 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1245 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173583,41 +198353,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -173629,9 +198400,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173640,12 +198411,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -173656,6 +198427,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173663,6 +198435,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173701,28 +198474,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1099 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1246 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -173731,7 +198504,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173747,7 +198520,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -173759,29 +198532,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -173793,9 +198567,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -173804,11 +198578,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -173820,6 +198594,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -173827,6 +198602,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -173865,28 +198641,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1100 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1247 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -173895,7 +198671,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -173910,8 +198686,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -173923,44 +198699,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -173968,12 +198745,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -173984,13 +198763,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174029,16 +198810,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1101 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1248 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -174049,17 +198830,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174074,42 +198853,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -174122,9 +198902,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174132,11 +198912,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -174148,6 +198930,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -174155,6 +198938,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174193,37 +198977,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1102 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1249 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174237,54 +199019,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174292,8 +199079,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 @@ -174308,13 +199097,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174353,37 +199144,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1103 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1250 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174397,54 +199186,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174452,8 +199246,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 @@ -174468,13 +199264,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174513,37 +199311,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1104 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1251 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174558,7 +199354,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -174568,13 +199364,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 @@ -174585,13 +199382,13 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -174606,9 +199403,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174616,12 +199413,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -174632,13 +199431,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174677,20 +199478,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1105 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1252 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -174698,16 +199499,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174715,49 +199514,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -174768,11 +199568,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174781,11 +199581,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -174796,6 +199598,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -174803,6 +199606,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -174841,15 +199645,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1106 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1253 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -174861,17 +199665,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -174886,53 +199688,54 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 8 - LVCB: 4 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -174940,13 +199743,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174956,13 +199762,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175001,37 +199809,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1107 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1254 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175046,36 +199852,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 - LVCB: 4 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -175089,10 +199896,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175100,13 +199907,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175116,13 +199926,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175161,8 +199973,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1108 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1255 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -175171,27 +199983,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175205,37 +200015,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -175249,10 +200064,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175260,13 +200075,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175276,13 +200094,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175321,8 +200141,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1109 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1256 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -175331,27 +200151,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175365,37 +200183,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -175409,10 +200232,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175420,13 +200243,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175436,13 +200262,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175481,8 +200309,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1110 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1257 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -175491,27 +200319,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175525,7 +200351,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -175543,21 +200369,26 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 528 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -175569,10 +200400,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175580,13 +200411,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175596,13 +200428,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175641,8 +200475,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1111 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1258 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -175651,11 +200485,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -175663,7 +200497,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -175671,7 +200505,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175685,54 +200519,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175740,13 +200579,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175756,13 +200596,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175801,28 +200643,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1112 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1259 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -175831,7 +200673,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175845,8 +200687,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -175863,36 +200705,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -175900,13 +200747,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175916,13 +200766,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -175961,37 +200813,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1113 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1260 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -175999,7 +200849,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -176023,29 +200873,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176053,10 +200904,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176064,13 +200915,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176080,6 +200932,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -176087,6 +200940,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176125,37 +200979,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1114 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1261 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176163,14 +201017,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -176187,21 +201041,26 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -176212,10 +201071,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -176224,13 +201083,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176240,13 +201102,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176285,8 +201149,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1115 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1262 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -176295,10 +201159,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -176307,15 +201171,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176323,15 +201185,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176339,33 +201201,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176373,10 +201240,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176384,13 +201251,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176400,13 +201270,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176445,37 +201317,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1116 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1263 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176483,15 +201353,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176499,37 +201369,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176537,10 +201408,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176548,13 +201419,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176564,6 +201438,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -176571,6 +201446,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176609,37 +201485,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1117 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1264 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176647,64 +201521,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176712,13 +201587,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176728,6 +201606,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -176735,6 +201614,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176773,37 +201653,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1118 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1265 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176811,7 +201689,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -176819,7 +201697,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176827,37 +201705,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176865,10 +201744,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -176876,13 +201755,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176892,6 +201772,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -176899,6 +201780,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -176937,37 +201819,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1119 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 + SolutionIndex: 1266 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -176975,7 +201857,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -176983,7 +201865,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176991,37 +201873,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -177029,10 +201912,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177040,13 +201923,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177056,6 +201940,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -177063,6 +201948,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177101,37 +201987,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1120 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 + SolutionIndex: 1267 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177139,53 +202025,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -177193,10 +202076,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177205,12 +202088,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177220,13 +202106,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177265,37 +202153,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1121 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 + SolutionIndex: 1268 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177303,63 +202189,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 - LVCB: 16 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 784 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -177369,12 +202252,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177384,13 +202270,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177429,37 +202317,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1122 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 + SolutionIndex: 1269 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177467,7 +202353,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -177475,8 +202361,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -177487,44 +202373,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177532,13 +202419,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177548,6 +202436,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -177555,6 +202444,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177593,37 +202483,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1123 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 + SolutionIndex: 1270 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177631,64 +202521,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 16 + LSPA: 4 LSPB: 16 - LVCA: 16 + LVCA: 64 LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177696,13 +202583,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177712,13 +202602,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177757,37 +202649,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1124 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 + SolutionIndex: 1271 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -177795,53 +202685,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -177849,10 +202736,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -177860,13 +202747,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -177876,13 +202766,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -177921,33 +202813,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1125 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 + SolutionIndex: 1272 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -177959,13 +202849,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -177986,23 +202876,19 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -178013,7 +202899,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -178027,13 +202913,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178044,7 +202931,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -178090,8 +202977,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1126 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1273 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -178114,7 +203001,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -178132,7 +203019,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -178161,15 +203048,11 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -178199,6 +203082,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178209,7 +203093,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -178255,8 +203139,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1127 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1274 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -178277,7 +203161,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -178293,54 +203177,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -178348,10 +203228,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -178359,13 +203239,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178376,8 +203259,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -178422,33 +203305,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1128 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1275 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -178460,15 +203341,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -178476,38 +203357,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -178515,10 +203396,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -178526,13 +203407,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178544,7 +203428,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -178589,33 +203473,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1129 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1276 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -178627,7 +203509,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -178635,7 +203517,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -178643,38 +203525,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -178682,10 +203564,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -178693,13 +203575,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178756,32 +203639,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1130 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1277 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -178794,54 +203677,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -178849,10 +203732,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -178860,13 +203743,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -178923,33 +203809,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1131 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1278 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -178961,7 +203845,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -178969,57 +203853,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -179027,15 +203911,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179092,31 +203977,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1132 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1279 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -179128,7 +204013,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -179137,14 +204022,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -179154,28 +204039,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -179183,10 +204068,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -179194,13 +204079,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179257,31 +204143,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1133 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1280 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179295,7 +204181,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -179303,46 +204189,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -179350,10 +204236,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -179361,15 +204247,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179426,35 +204313,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1134 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1281 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -179468,42 +204355,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 4 - LSPB: 64 + LSPB: 32 LVCA: 64 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -179517,9 +204400,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -179528,15 +204411,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179547,7 +204431,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -179593,8 +204477,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1135 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1282 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179603,17 +204487,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -179636,7 +204520,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -179645,7 +204529,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -179655,22 +204539,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -179684,9 +204568,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -179695,15 +204579,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179760,8 +204643,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1136 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1283 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179770,10 +204653,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -179785,10 +204668,12 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -179804,30 +204689,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -179863,14 +204748,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -179882,7 +204768,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -179927,8 +204813,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1137 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1284 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179943,11 +204829,11 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -179971,7 +204857,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -179979,11 +204865,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -179991,9 +204877,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 @@ -180030,14 +204916,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180094,8 +204981,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1138 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1285 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180114,7 +205001,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -180122,7 +205009,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180136,42 +205023,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -180185,10 +205068,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180196,8 +205079,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -180205,6 +205088,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180215,8 +205099,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -180261,8 +205145,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1139 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1286 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180271,17 +205155,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -180289,7 +205173,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180297,7 +205181,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -180305,42 +205189,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -180351,11 +205235,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180363,15 +205247,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180383,7 +205268,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -180428,8 +205313,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1140 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1287 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180438,25 +205323,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180470,44 +205355,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 520 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -180519,10 +205400,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180530,13 +205411,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180547,7 +205431,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -180593,8 +205477,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1141 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1288 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180603,27 +205487,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180631,50 +205513,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -180685,11 +205563,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180697,13 +205575,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180714,8 +205595,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -180760,8 +205641,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1142 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1289 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180770,27 +205651,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -180805,41 +205684,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -180853,9 +205732,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -180864,13 +205743,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180927,8 +205809,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1143 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1290 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180937,23 +205819,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -180972,58 +205852,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181031,13 +205911,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181049,7 +205932,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -181094,37 +205977,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1144 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1291 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -181132,65 +206013,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181198,13 +206075,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181215,7 +206095,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -181261,37 +206141,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1145 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1292 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -181299,15 +206177,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -181315,49 +206193,45 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181365,13 +206239,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181382,7 +206259,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -181428,33 +206305,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1146 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1293 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -181466,7 +206341,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181475,7 +206350,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -181492,39 +206367,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181532,13 +206407,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181595,20 +206471,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1147 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1294 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -181616,10 +206492,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -181633,7 +206509,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181641,32 +206517,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -181687,27 +206563,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181748,6 +206627,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -181764,8 +206644,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1148 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1295 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -181774,21 +206654,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -181800,7 +206680,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181808,32 +206688,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 64 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -181854,27 +206734,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181886,7 +206769,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -181915,6 +206798,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -181931,8 +206815,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1149 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1296 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -181941,21 +206825,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -181967,7 +206851,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181975,40 +206859,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -182021,20 +206905,22 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -182042,6 +206928,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182082,6 +206969,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182098,8 +206986,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1150 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1297 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182108,21 +206996,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182134,14 +207022,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -182160,14 +207048,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -182188,27 +207076,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182220,7 +207109,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -182249,6 +207138,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182265,8 +207155,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1151 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1298 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182275,11 +207165,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -182289,7 +207179,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182301,14 +207193,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -182327,14 +207219,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -182355,27 +207247,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182387,7 +207280,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -182416,6 +207309,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182432,8 +207326,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1152 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1299 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182442,11 +207336,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -182456,7 +207350,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182475,8 +207371,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -182488,28 +207384,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 96 LVCA: 32 LVCB: 2 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -182523,26 +207419,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182554,7 +207451,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -182583,6 +207480,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182599,8 +207497,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1153 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1300 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182609,21 +207507,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182642,8 +207542,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -182655,28 +207555,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 96 LVCA: 32 LVCB: 2 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -182690,26 +207590,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182750,6 +207651,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182766,8 +207668,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1154 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1301 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182776,21 +207678,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182802,15 +207706,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -182822,28 +207726,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 96 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -182856,27 +207760,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182917,6 +207822,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -182933,8 +207839,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1155 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1302 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182943,21 +207849,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -182976,9 +207884,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -182998,15 +207906,15 @@ LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -183025,23 +207933,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183082,6 +207995,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183098,8 +208012,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1156 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1303 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183108,11 +208022,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -183120,11 +208034,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -183143,9 +208055,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -183165,15 +208077,15 @@ LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -183192,23 +208104,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183220,7 +208137,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -183249,6 +208166,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183265,8 +208183,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1157 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1304 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183275,11 +208193,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -183290,8 +208208,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -183309,38 +208225,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -183354,19 +208274,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -183374,6 +208296,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183384,8 +208307,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -183414,6 +208337,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183430,8 +208354,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1158 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 + SolutionIndex: 1305 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183440,17 +208364,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -183466,7 +208390,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -183474,7 +208398,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -183482,65 +208406,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183581,6 +208508,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183597,31 +208525,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1159 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1306 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -183633,54 +208561,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -183688,24 +208616,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183746,6 +208679,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183762,33 +208696,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1160 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1307 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -183800,7 +208732,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -183826,53 +208758,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183913,6 +208848,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -183929,31 +208865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1161 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1308 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -183967,15 +208903,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -183983,65 +208919,66 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184053,7 +208990,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -184082,6 +209019,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184098,31 +209036,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1162 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 + SolutionIndex: 1309 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184134,15 +209074,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -184154,34 +209094,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -184189,26 +209129,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184220,7 +209161,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -184249,6 +209190,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184265,31 +209207,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1163 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1310 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184301,14 +209245,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -184327,28 +209271,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -184356,26 +209300,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184416,6 +209361,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184432,20 +209378,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1164 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1311 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -184453,10 +209399,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184468,16 +209416,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -184494,55 +209442,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184583,6 +209532,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184599,20 +209549,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1165 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1312 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -184620,10 +209570,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184635,79 +209587,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184748,6 +209705,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184764,15 +209722,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1166 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1313 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -184784,13 +209742,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184802,14 +209758,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -184829,27 +209785,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -184858,23 +209814,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184915,6 +209876,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -184931,15 +209893,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1167 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1314 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -184952,12 +209914,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -184978,7 +209938,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -184998,15 +209958,15 @@ LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -185016,32 +209976,35 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185082,6 +210045,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -185098,15 +210062,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1168 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1315 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -185119,7 +210083,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -185136,7 +210100,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -185145,45 +210109,45 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 8 + LVCA: 64 LVCB: 8 - LVPA: 32 - LVPB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -185191,26 +210155,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185251,12 +210218,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185267,31 +210236,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1169 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1316 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185303,7 +210272,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -185323,34 +210292,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3104 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -185358,26 +210327,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185418,12 +210390,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185434,31 +210408,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1170 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1317 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185470,13 +210444,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -185490,61 +210464,60 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185555,7 +210528,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -185585,12 +210558,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185601,31 +210576,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1171 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 + SolutionIndex: 1318 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185637,79 +210612,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185750,12 +210730,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185766,33 +210748,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1172 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1319 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185804,15 +210784,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -185824,59 +210804,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185917,12 +210902,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -185933,33 +210920,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1173 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1320 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -185971,15 +210956,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -185991,61 +210976,60 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186056,8 +211040,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -186086,12 +211070,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186102,14 +211088,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1174 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1321 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -186122,11 +211108,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186138,7 +211124,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -186146,73 +211132,76 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186253,12 +211242,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186269,31 +211260,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1175 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1322 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186305,54 +211296,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -186360,26 +211351,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186391,7 +211383,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -186420,12 +211412,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186436,31 +211430,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1176 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1323 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186472,81 +211468,82 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186587,12 +211584,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186603,31 +211602,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1177 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1324 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186639,81 +211640,82 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186725,7 +211727,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -186754,12 +211756,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186770,31 +211774,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1178 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1325 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186806,7 +211812,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -186814,7 +211820,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -186822,65 +211828,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -186892,7 +211901,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -186921,12 +211930,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -186937,31 +211948,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1179 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1326 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -186979,7 +211990,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -186993,7 +212004,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -187008,11 +212019,15 @@ LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -187025,24 +212040,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -187055,8 +212072,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -187085,12 +212102,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187101,8 +212120,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1180 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1327 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187111,19 +212130,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -187137,7 +212156,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187164,19 +212183,19 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 4 - LSPB: 32 + LSPB: 16 LVCA: 64 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -187187,26 +212206,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -187220,7 +212241,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -187249,12 +212270,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187265,8 +212288,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1181 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1328 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187275,11 +212298,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -187287,9 +212310,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187301,7 +212324,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187309,7 +212332,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -187321,28 +212344,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -187355,20 +212378,22 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -187417,12 +212442,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187433,8 +212460,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1182 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1329 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187443,21 +212470,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187469,7 +212496,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187477,36 +212504,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -187523,25 +212550,27 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -187585,12 +212614,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187601,8 +212632,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1183 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1330 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187611,21 +212642,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187637,14 +212668,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -187663,22 +212694,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -187691,24 +212718,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -187721,7 +212752,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -187751,12 +212782,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187767,8 +212800,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1184 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1331 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187777,10 +212810,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -187791,9 +212824,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187805,16 +212836,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -187831,18 +212862,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -187859,20 +212890,24 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -187890,7 +212925,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -187919,12 +212954,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -187935,8 +212972,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1185 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1332 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187945,10 +212982,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -187959,9 +212996,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -187973,7 +213008,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187981,7 +213016,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -187989,32 +213024,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -188027,26 +213062,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -188089,12 +213126,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188105,8 +213144,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1186 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1333 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188115,21 +213154,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -188148,8 +213187,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -188157,32 +213196,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -188196,22 +213235,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -188255,12 +213298,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188271,8 +213316,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1187 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1334 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188281,23 +213326,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -188317,7 +213360,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -188325,11 +213368,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -188337,22 +213380,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -188371,17 +213414,19 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -188396,7 +213441,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -188425,12 +213470,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188441,8 +213488,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1188 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1335 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188461,9 +213508,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -188512,15 +213559,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -188539,7 +213586,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -188593,12 +213642,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188609,8 +213660,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1189 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1336 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188645,7 +213696,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -188654,7 +213705,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -188672,12 +213723,184 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1337 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 @@ -188707,17 +213930,17 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -188761,12 +213984,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188777,8 +214002,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1190 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1338 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188797,11 +214022,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -188813,48 +214040,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -188867,7 +214094,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -188875,14 +214102,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -188929,12 +214156,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -188945,8 +214174,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1191 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1339 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188965,11 +214194,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -189043,7 +214274,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -189095,12 +214328,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189111,8 +214346,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1192 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1340 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189211,7 +214446,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -189263,12 +214500,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189279,8 +214518,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1193 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1341 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189323,40 +214562,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -189368,21 +214611,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -189399,8 +214642,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -189429,12 +214672,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189445,8 +214690,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1194 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 1342 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189455,21 +214700,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -189487,7 +214734,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -189501,26 +214748,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 2 LSPB: 32 - LVCA: 64 + LVCA: 128 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 784 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -189532,25 +214783,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -189563,8 +214816,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -189593,12 +214846,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189609,8 +214864,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1195 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1343 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189619,19 +214874,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -189645,23 +214900,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -189671,24 +214926,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -189699,24 +214954,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -189759,12 +215018,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189775,8 +215036,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1196 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1344 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189785,11 +215046,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -189799,9 +215060,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -189813,46 +215072,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 64 LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -189863,26 +215126,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -189895,8 +215158,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -189925,12 +215188,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -189941,8 +215206,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1197 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1345 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189951,13 +215216,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -189965,7 +215230,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -189977,46 +215244,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -190027,26 +215298,26 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -190059,7 +215330,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -190089,12 +215360,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190105,8 +215378,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1198 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1346 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190115,21 +215388,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190141,16 +215416,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190168,17 +215443,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4224 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -190191,27 +215470,29 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190223,8 +215504,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -190253,12 +215534,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190269,15 +215552,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1199 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1347 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -190290,10 +215573,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190305,16 +215588,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190332,17 +215615,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -190355,15 +215642,17 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -190373,7 +215662,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190385,8 +215674,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -190415,12 +215704,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190431,15 +215722,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1200 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1348 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -190452,10 +215743,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -190469,50 +215760,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 800 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -190520,26 +215811,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190581,12 +215872,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190597,8 +215890,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1201 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 1349 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190607,21 +215900,23 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190633,7 +215928,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -190641,46 +215936,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1680 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 192 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -190689,25 +215984,27 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190749,12 +216046,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190765,8 +216064,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1202 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 1350 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190775,21 +216074,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190801,16 +216100,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190828,27 +216127,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 2 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -190857,23 +216156,27 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -190886,7 +216189,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -190915,12 +216218,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -190931,8 +216236,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1203 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 1351 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190941,23 +216246,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -190969,54 +216272,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -191024,26 +216327,26 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -191056,7 +216359,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -191085,12 +216388,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191101,31 +216406,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1204 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1352 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191137,13 +216444,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -191157,61 +216464,59 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -191223,7 +216528,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191253,12 +216558,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191269,31 +216576,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1205 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 1353 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191305,79 +216612,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 1 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -191389,7 +216696,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191419,12 +216726,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191435,33 +216744,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1206 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1354 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191473,81 +216780,79 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 1 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -191559,7 +216864,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191589,12 +216894,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191605,35 +216912,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1207 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1355 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -191641,7 +216948,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -191649,36 +216956,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 32 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -191691,25 +216998,27 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -191753,12 +217062,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191769,8 +217080,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1208 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1356 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191779,21 +217090,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191805,14 +217116,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -191821,7 +217132,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -191831,22 +217142,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -191859,24 +217170,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -191919,12 +217234,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -191935,8 +217252,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1209 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1357 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191945,10 +217262,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -191957,15 +217274,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -191980,41 +217295,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -192028,25 +217343,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192089,12 +217404,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192105,8 +217422,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1210 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1358 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192115,21 +217432,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -192148,7 +217467,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -192167,22 +217486,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 4 + LSPA: 5 LSPB: 64 - LVCA: 64 + LVCA: 48 LVCB: 4 - LVPA: 2 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -192196,21 +217515,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -192257,12 +217576,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192273,8 +217594,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1211 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1359 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192283,10 +217604,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -192295,13 +217616,15 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192315,38 +217638,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -192360,25 +217687,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192391,8 +217718,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192421,12 +217748,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192437,8 +217766,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1212 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1360 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192447,25 +217776,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 32 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192480,43 +217811,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 96 LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 1544 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192528,21 +217859,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -192560,7 +217891,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192589,12 +217920,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192605,8 +217938,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1213 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1361 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192615,25 +217948,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192641,50 +217976,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 520 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -192692,19 +218031,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -192723,8 +218064,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192753,12 +218094,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192769,35 +218112,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1214 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1362 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192811,70 +218154,74 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192887,8 +218234,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192917,12 +218264,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -192933,35 +218282,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1215 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1363 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -192969,7 +218320,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -192995,28 +218346,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 3392 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -193024,25 +218375,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193056,7 +218409,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -193085,12 +218438,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193101,31 +218456,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1216 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1364 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193137,7 +218492,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -193146,71 +218501,73 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3392 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193253,12 +218610,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193269,35 +218628,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1217 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 1365 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -193311,18 +218670,18 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -193331,50 +218690,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193387,7 +218752,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -193417,12 +218782,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193433,35 +218800,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1218 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 + SolutionIndex: 1366 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -193469,23 +218836,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -193495,20 +218862,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -193519,20 +218890,22 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -193551,7 +218924,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -193581,12 +218954,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193597,8 +218972,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1219 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 + SolutionIndex: 1367 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -193607,21 +218982,21 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193640,43 +219015,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -193688,23 +219063,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193747,12 +219126,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193763,33 +219144,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1220 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1368 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193801,65 +219180,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193869,14 +219248,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -193926,6 +219303,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -193936,31 +219314,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1221 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1369 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193972,15 +219352,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -193992,45 +219372,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194040,10 +219420,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -194097,6 +219475,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194107,31 +219486,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1222 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1370 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194143,7 +219524,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -194151,56 +219532,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -194211,13 +219592,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -194268,6 +219649,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194278,14 +219660,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1223 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1371 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -194294,15 +219676,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194314,65 +219696,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194382,11 +219764,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -194437,6 +219821,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194447,33 +219832,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1224 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1372 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194485,65 +219868,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194553,11 +219936,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -194608,6 +219993,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194618,33 +220004,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1225 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1373 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194656,7 +220040,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -194665,45 +220049,45 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -194711,10 +220095,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194724,11 +220108,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -194743,7 +220127,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -194779,6 +220163,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194789,31 +220174,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1226 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1374 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -194827,23 +220212,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -194853,28 +220238,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -194882,10 +220267,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194895,11 +220280,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -194950,6 +220337,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -194960,33 +220348,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1227 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1375 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -194998,7 +220384,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195007,14 +220393,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -195024,28 +220410,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -195053,10 +220439,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195066,11 +220452,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -195121,6 +220507,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195131,31 +220518,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1228 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1376 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -195169,7 +220556,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195177,57 +220564,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195237,13 +220624,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -195258,7 +220645,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -195294,6 +220681,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195304,31 +220692,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1229 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1377 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195340,7 +220728,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195349,7 +220737,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -195366,39 +220754,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195408,8 +220796,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -195429,7 +220817,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -195465,6 +220853,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195475,20 +220864,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1230 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1378 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -195496,10 +220885,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195511,7 +220900,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195537,28 +220926,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -195566,10 +220955,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195579,13 +220968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -195636,6 +221025,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195646,20 +221036,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1231 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1379 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -195667,10 +221057,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195682,14 +221072,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -195708,28 +221098,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -195737,10 +221127,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195750,13 +221140,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -195807,6 +221195,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195817,20 +221206,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1232 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1380 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -195838,10 +221227,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195853,16 +221244,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -195879,39 +221270,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195921,10 +221312,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -195978,6 +221367,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -195988,20 +221378,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1233 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1381 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -196009,10 +221399,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196024,7 +221416,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -196050,28 +221442,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -196079,10 +221471,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196092,8 +221484,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -196147,30 +221539,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1234 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1382 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -196178,10 +221573,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -196195,7 +221590,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -196203,7 +221598,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -196211,38 +221606,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -196250,10 +221645,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196263,11 +221658,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -196282,7 +221677,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -196318,41 +221713,44 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1235 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1383 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -196366,7 +221764,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -196392,28 +221790,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -196421,10 +221819,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196434,11 +221832,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -196453,7 +221851,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -196489,30 +221887,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1236 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1384 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -196520,10 +221921,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -196544,7 +221945,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -196607,6 +222008,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -196624,7 +222027,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -196660,18 +222063,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1237 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1385 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196692,11 +222098,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196715,7 +222119,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -196778,6 +222182,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -196831,18 +222237,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1238 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1386 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196863,11 +222272,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196888,7 +222295,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -196905,39 +222312,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196947,13 +222354,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -197004,30 +222411,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1239 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1387 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -197035,7 +222445,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -197050,14 +222460,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -197076,28 +222486,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -197105,10 +222515,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -197118,13 +222528,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -197139,7 +222547,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -197175,30 +222583,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1240 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1388 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -197206,10 +222617,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -197221,7 +222634,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -197247,28 +222660,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -197276,10 +222689,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -197289,11 +222702,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -197344,30 +222757,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1241 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1389 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -197375,10 +222791,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - [2, 3, 0, 1] @@ -199782,36 +225198,18 @@ - [101, 4291.65] - - [3136, 64, 128, 64] - [183, 8175.06] - - - [784, 512, 64, 128] - - [181, 8378.34] - - - [3136, 256, 64, 64] - - [184, 8506.65] - - - [12544, 1024, 1, 256] - - [177, 8927.93] - - [784, 128, 128, 512] - [182, 8190.53] - - [784, 512, 256, 128] - [180, 8637.14] - - - [3136, 64, 64, 256] - - [179, 8782.93] - - - [3136, 512, 1, 2048] - - [176, 7298.32] - - - [12544, 256, 1, 1024] - - [188, 7667.25] - - - [3136, 2048, 1, 512] - - [187, 8447.22] - - [3136, 256, 256, 64] - [180, 8663.08] - - [3136, 64, 128, 256] - [178, 8943.46] - - - [784, 128, 64, 512] - - [186, 8006.27] - - [3136, 64, 256, 64] - [183, 8267.12] - - [784, 512, 128, 128] - [180, 8564.25] - - - [3136, 64, 64, 64] - - [183, 8009.35] - - [784, 128, 256, 512] - [184, 8377.06] - - [3136, 64, 256, 256] @@ -199896,8 +225294,6 @@ - [213, 4032.98] - - [1024, 256, 1, 4096] - [201, 7326.3] - - - [4096, 512, 1, 4096] - - [193, 9471.97] - - [1024, 200, 1, 2048] - [194, 5530.46] - - [2048, 1024, 1, 512] @@ -203614,6236 +229010,6688 @@ - [608, 9352.16] - - [256, 8976, 1, 44505] - [612, 8430.23] + - - [6272, 256, 1, 528] + - [664, 7389.94] + - - [3136, 2048, 1, 1024] + - [645, 9657.94] + - - [6272, 112, 1, 512] + - [643, 5931.09] + - - [2048, 320, 1, 1280] + - [663, 7772.99] + - - [289, 256, 1, 1568] + - [684, 3718.17] + - - [3136, 64, 64, 64] + - [623, 8201.15] + - - [50176, 128, 1, 256] + - [646, 8908.58] + - - [5329, 64, 1, 448] + - [629, 4602.2] + - - [289, 192, 1, 1344] + - [681, 3452.59] + - - [12544, 1024, 1, 256] + - [646, 9742.64] + - - [784, 64, 32, 192] + - [622, 6844.61] + - - [6272, 64, 1, 480] + - [630, 5562.24] + - - [196, 128, 1, 800] + - [672, 1639.74] + - - [64, 512, 1, 1344] + - [671, 2313.04] + - - [6272, 64, 1, 512] + - [629, 5609.19] + - - [6272, 160, 1, 528] + - [630, 6149.7] + - - [289, 160, 32, 768] + - [657, 6637.82] + - - [12544, 256, 1, 1024] + - [664, 8790.46] + - - [289, 224, 1, 1568] + - [684, 3270.17] + - - [5329, 64, 32, 160] + - [637, 9091.04] + - - [5329, 96, 1, 576] + - [664, 5555.66] + - - [3025, 64, 1, 363] + - [682, 4392.3] + - - [784, 32, 32, 192] + - [653, 5633.8] + - - [3136, 512, 1, 1024] + - [649, 7553.14] + - - [6272, 16, 1, 480] + - [684, 3219.85] + - - [1225, 64, 32, 288] + - [644, 8240.58] + - - [64, 256, 1, 1536] + - [677, 1456.36] + - - [289, 192, 32, 768] + - [656, 7372.8] + - - [2048, 448, 1, 1280] + - [639, 8403.01] + - - [3136, 2048, 1, 512] + - [638, 9486.31] + - - [289, 256, 1, 2016] + - [684, 3876.08] + - - [289, 384, 32, 1024] + - [623, 7350.54] + - - [1568, 32, 1, 832] + - [673, 2717.87] + - - [3136, 64, 32, 64] + - [626, 7657.26] + - - [289, 160, 1, 1120] + - [680, 2826.9] + - - [6272, 128, 1, 528] + - [634, 6926.26] + - - [21609, 32, 1, 288] + - [635, 3698.9] + - - [1225, 192, 1, 1728] + - [668, 7309.81] + - - [4096, 512, 1, 4096] + - [651, 10272.1] + - - [64, 256, 1, 1152] + - [677, 1387.82] + - - [6272, 96, 1, 480] + - [665, 6371.56] + - - [784, 96, 1, 800] + - [685, 3330.27] + - - [2048, 448, 1, 2048] + - [639, 8622.65] + - - [784, 96, 32, 192] + - [654, 7092.36] + - - [3136, 64, 64, 256] + - [647, 9579.16] + - - [289, 224, 1, 1344] + - [684, 3180.01] + - - [1001, 512, 1, 4096] + - [625, 8195.07] + - - [2048, 192, 1, 1280] + - [630, 6120.09] + - - [1225, 64, 32, 256] + - [635, 8076.62] + - - [2048, 256, 1, 1536] + - [625, 8137.7] + - - [1225, 64, 1, 1200] + - [684, 3552.87] + - - [6272, 128, 1, 512] + - [638, 6878.21] + - - [729, 192, 1, 1600] + - [683, 5016.77] + - - [289, 192, 1, 896] + - [681, 3091.87] + - - [1568, 384, 1, 832] + - [664, 6934.62] + - - [784, 16, 32, 192] + - [655, 3380.28] + - - [1568, 256, 1, 832] + - [629, 5980.86] + - - [1568, 48, 1, 832] + - [686, 3275.09] + - - [1568, 192, 1, 832] + - [624, 4441.11] + - - [289, 192, 32, 1024] + - [627, 6563.06] + - - [6272, 32, 1, 528] + - [668, 4998.67] + - - [49, 128, 1, 1200] + - [669, 550.175] + - - [1225, 64, 32, 384] + - [641, 8589.33] + - - [289, 128, 1, 896] + - [680, 2103.1] + - - [1568, 160, 1, 832] + - [668, 6995.05] + - - [1001, 32, 1, 1024] + - [677, 1744.72] + - - [2048, 320, 1, 2048] + - [662, 7118.04] + - - [2048, 384, 1, 1536] + - [625, 8184.01] + - - [50176, 512, 1, 256] + - [637, 9852.4] + - - [289, 256, 1, 1792] + - [686, 3809.75] + - - [64, 448, 1, 1152] + - [678, 2128.23] + - - [5041, 96, 1, 576] + - [663, 5279.3] + - - [6272, 192, 1, 480] + - [625, 7479.65] + - - [784, 32, 32, 256] + - [652, 5708.91] + - - [1001, 32, 1, 2048] + - [679, 2141.04] + - - [289, 192, 1, 1120] + - [675, 3277.77] + - - [6272, 32, 1, 512] + - [667, 4978.7] + - - [289, 384, 1, 3456] + - [684, 5904.14] + - - [289, 384, 1, 2592] + - [685, 5707.34] + - - [784, 128, 64, 512] + - [631, 8864.39] + - - [12544, 1024, 1, 512] + - [646, 10008.3] + - - [12544, 256, 1, 512] + - [664, 8628.08] + - - [6272, 24, 1, 512] + - [668, 3568.07] + - - [5041, 192, 1, 720] + - [639, 8424.42] + - - [64, 320, 1, 1728] + - [672, 1469.66] + - - [784, 128, 32, 256] + - [640, 8104.14] + - - [289, 96, 1, 864] + - [678, 1838.25] + - - [1225, 32, 32, 192] + - [659, 5949.72] + - - [1568, 128, 1, 832] + - [667, 5718.69] + - - [289, 128, 32, 768] + - [625, 7289.25] + - - [3136, 256, 64, 64] + - [633, 9103.92] + - - [196, 64, 1, 800] + - [671, 915.62] + - - [4096, 512, 1, 9216] + - [648, 10351.4] + - - [12544, 64, 1, 147] + - [638, 5069.33] + - - [784, 32, 1, 400] + - [669, 1140.36] + - - [6272, 160, 1, 512] + - [629, 6140.08] + - - [1225, 48, 32, 288] + - [635, 5978.61] + - - [64, 320, 1, 2880] + - [676, 1920.0] + - - [1225, 64, 32, 192] + - [629, 7641.01] + - - [1001, 32, 1, 1536] + - [677, 2084.79] + - - [784, 64, 32, 256] + - [621, 6990.51] + - - [64, 384, 1, 1152] + - [678, 1862.6] + - - [784, 512, 64, 128] + - [632, 9025.95] + - - [3136, 512, 1, 2048] + - [650, 7764.3] + - - [6272, 144, 1, 512] + - [625, 5574.04] + - - [1225, 192, 32, 384] + - [639, 9373.83] + - - [64, 192, 1, 1728] + - [677, 1206.46] + - - [8192, 320, 1, 1280] + - [691, 9875.92] + - - [8192, 320, 1, 2048] + - [694, 9745.7] + - - [8192, 384, 1, 1280] + - [691, 10046.2] + - - [8192, 192, 1, 1280] + - [694, 9950.9] + - - [8192, 192, 1, 2048] + - [690, 9559.67] + - - [8192, 384, 1, 2048] + - [692, 9945.74] + - - [8192, 448, 1, 2048] + - [693, 9908.51] + - - [1001, 64, 1, 1536] + - [687, 3649.94] + - - [8192, 448, 1, 1280] + - [691, 9981.35] + - - [1001, 64, 1, 2048] + - [688, 3580.87] + - - [1001, 128, 1, 2048] + - [689, 5587.87] - - [704, 1024, 1, 128] - - [723, 3019.56] + - [797, 3019.56] - - [1024, 1024, 1, 3328] - - [761, 8162.65] + - [835, 8162.65] - - [4, 704, 1, 1280] - - [664, 319.646] + - [738, 319.646] - - [4, 1856, 1, 3328] - - [694, 550.614] + - [768, 550.614] - - [1856, 448, 1, 3328] - - [746, 6813.15] + - [820, 6813.15] - - [2944, 4288, 1, 1280] - - [755, 8975.86] + - [829, 8975.86] - - [2368, 64, 1, 3328] - - [669, 5482.33] + - [743, 5482.33] - - [1760, 32, 1, 1760] - - [708, 3860.04] + - [782, 3860.04] - - [2368, 5888, 1, 256] - - [752, 8656.83] + - [826, 8656.83] - - [5888, 1856, 1, 256] - - [742, 7881.53] + - [816, 7881.53] - - [64, 3584, 1, 1280] - - [678, 4835.43] + - [752, 4835.43] - - [512, 24000, 1, 1536] - - [749, 8666.0] + - [823, 8666.0] - - [128, 6784, 1, 3328] - - [746, 7062.35] + - [820, 7062.35] - - [5888, 1408, 1, 256] - - [759, 8130.32] + - [833, 8130.32] - - [5888, 1856, 1, 3328] - - [749, 8840.85] + - [823, 8840.85] - - [512, 4, 1, 512] - - [634, 170.323] + - [708, 170.323] - - [35, 1500, 1, 2560] - - [638, 2896.65] + - [712, 2896.65] - - [1856, 4288, 1, 256] - - [738, 8374.73] + - [812, 8374.73] - - [1024, 5056, 1, 128] - - [735, 3304.35] + - [809, 3304.35] - - [5056, 5056, 1, 3328] - - [749, 8905.53] + - [823, 8905.53] - - [1408, 5888, 1, 1280] - - [749, 9418.2] + - [823, 9418.2] - - [2368, 448, 1, 128] - - [723, 3075.07] + - [797, 3075.07] - - [6144, 6000, 1, 2560] - - [749, 9336.43] + - [823, 9336.43] - - [2368, 6784, 1, 128] - - [722, 4919.36] + - [796, 4919.36] - - [1024, 3584, 1, 3328] - - [740, 8071.17] + - [814, 8071.17] - - [512, 48000, 1, 2048] - - [749, 8763.16] + - [823, 8763.16] - - [1408, 64, 1, 128] - - [645, 805.57] + - [719, 805.57] - - [256, 4288, 1, 3328] - - [771, 6331.96] + - [845, 6331.96] - - [5888, 1408, 1, 1280] - - [739, 9226.27] + - [813, 9226.27] - - [704, 1856, 1, 3328] - - [765, 6309.5] + - [839, 6309.5] - - [1408, 4288, 1, 256] - - [749, 8374.6] + - [823, 8374.6] - - [1024, 2368, 1, 256] - - [746, 7341.12] + - [820, 7341.12] - - [64, 4, 1, 256] - - [689, 13.1032] + - [763, 13.1032] - - [1408, 1856, 1, 1280] - - [756, 8773.05] + - [830, 8773.05] - - [1408, 64, 1, 1280] - - [702, 4050.08] + - [776, 4050.08] - - [448, 1024, 1, 1280] - - [765, 6071.26] + - [839, 6071.26] - - [4096, 32, 1, 4096] - - [699, 5491.82] + - [773, 5491.82] - - [256, 1408, 1, 3328] - - [751, 5351.49] + - [825, 5351.49] - - [5056, 5056, 1, 1280] - - [759, 9408.67] + - [833, 9408.67] - - [448, 5056, 1, 256] - - [764, 6680.54] + - [838, 6680.54] - - [704, 1856, 1, 1280] - - [741, 7504.03] + - [815, 7504.03] - - [128, 5056, 1, 128] - - [656, 2316.58] + - [730, 2316.58] - - [2368, 128, 1, 256] - - [741, 3660.22] + - [815, 3660.22] - - [1856, 1408, 1, 128] - - [728, 3885.97] + - [802, 3885.97] - - [64, 5056, 1, 256] - - [751, 3318.91] + - [825, 3318.91] - - [6784, 256, 1, 3328] - - [749, 7590.64] + - [823, 7590.64] - - [1408, 3584, 1, 256] - - [738, 8276.92] + - [812, 8276.92] - - [4288, 448, 1, 256] - - [751, 7139.79] + - [825, 7139.79] - - [64, 704, 1, 128] - - [652, 375.567] + - [726, 375.567] - - [1024, 1856, 1, 128] - - [721, 2890.66] + - [795, 2890.66] - - [4288, 2944, 1, 1280] - - [755, 8981.45] + - [829, 8981.45] - - [704, 5056, 1, 1280] - - [741, 7684.72] + - [815, 7684.72] - - [2368, 704, 1, 3328] - - [756, 7070.14] + - [830, 7070.14] - - [256, 5888, 1, 256] - - [741, 7319.45] + - [815, 7319.45] - - [1856, 4288, 1, 3328] - - [739, 9238.69] + - [813, 9238.69] - - [256, 2944, 1, 256] - - [741, 6090.31] + - [815, 6090.31] - - [5888, 1024, 1, 256] - - [745, 8270.05] + - [819, 8270.05] - - [448, 64, 1, 1280] - - [698, 2493.32] + - [772, 2493.32] - - [3072, 64, 1, 1024] - - [681, 3149.77] + - [755, 3149.77] - - [3584, 4, 1, 1280] - - [783, 567.862] + - [857, 567.862] - - [2560, 16, 1, 2560] - - [690, 2887.15] + - [764, 2887.15] - - [2944, 64, 1, 256] - - [681, 2565.76] + - [755, 2565.76] - - [128, 4, 1, 1280] - - [784, 78.8692] + - [858, 78.8692] - - [1408, 2944, 1, 256] - - [745, 8337.3] + - [819, 8337.3] - - [256, 1856, 1, 1280] - - [771, 6267.35] + - [845, 6267.35] - - [6784, 5056, 1, 3328] - - [755, 9424.0] + - [829, 9424.0] - - [5056, 5056, 1, 256] - - [742, 8758.33] + - [816, 8758.33] - - [128, 256, 1, 256] - - [697, 1205.36] + - [771, 1205.36] - - [64, 1024, 1, 1280] - - [708, 3566.68] + - [782, 3566.68] - - [2944, 4, 1, 256] - - [661, 319.449] + - [735, 319.449] - - [704, 5056, 1, 128] - - [730, 4073.83] + - [804, 4073.83] - - [4, 2368, 1, 1280] - - [689, 496.992] + - [763, 496.992] - - [2368, 2944, 1, 1280] - - [738, 9085.55] + - [812, 9085.55] - - [448, 448, 1, 3328] - - [716, 5428.76] + - [790, 5428.76] - - [6784, 6784, 1, 1280] - - [755, 8727.03] + - [829, 8727.03] - - [1024, 256, 1, 3328] - - [765, 5499.42] + - [839, 5499.42] - - [1408, 4288, 1, 1280] - - [739, 9094.42] + - [813, 9094.42] - - [3584, 4288, 1, 1280] - - [742, 8703.88] + - [816, 8703.88] - - [512, 6000, 1, 2560] - - [745, 8474.56] + - [819, 8474.56] - - [2368, 704, 1, 1280] - - [751, 7651.59] + - [825, 7651.59] - - [5056, 4288, 1, 3328] - - [759, 8545.35] + - [833, 8545.35] - - [3584, 2368, 1, 3328] - - [747, 8797.88] + - [821, 8797.88] - - [5888, 6784, 1, 1280] - - [745, 8785.18] + - [819, 8785.18] - - [64, 704, 1, 1280] - - [668, 2783.48] + - [742, 2783.48] - - [4288, 256, 1, 256] - - [741, 6162.78] + - [815, 6162.78] - - [2944, 128, 1, 128] - - [643, 1951.33] + - [717, 1951.33] - - [6144, 32, 1, 2560] - - [702, 4589.05] + - [776, 4589.05] - - [6784, 448, 1, 1280] - - [746, 8674.31] + - [820, 8674.31] - - [2944, 5888, 1, 256] - - [759, 8991.76] + - [833, 8991.76] - - [64, 64, 1, 1280] - - [719, 712.448] + - [793, 712.448] - - [4288, 2944, 1, 256] - - [755, 8678.14] + - [829, 8678.14] - - [5888, 704, 1, 1280] - - [745, 8652.71] + - [819, 8652.71] - - [5056, 4, 1, 3328] - - [661, 650.772] + - [735, 650.772] - - [1856, 64, 1, 1280] - - [678, 4471.97] + - [752, 4471.97] - - [1760, 16, 1, 1760] - - [718, 2592.23] + - [792, 2592.23] - - [448, 5888, 1, 128] - - [728, 3823.03] + - [802, 3823.03] - - [5888, 64, 1, 3328] - - [710, 6013.22] + - [784, 6013.22] - - [2944, 256, 1, 3328] - - [751, 7791.45] + - [825, 7791.45] - - [1024, 64, 1, 128] - - [652, 592.516] + - [726, 592.516] - - [5056, 2368, 1, 1280] - - [738, 9260.53] + - [812, 9260.53] - - [448, 3584, 1, 1280] - - [759, 6771.34] + - [833, 6771.34] - - [6784, 5888, 1, 256] - - [753, 7933.39] + - [827, 7933.39] - - [64, 1024, 1, 3328] - - [702, 4783.08] + - [776, 4783.08] - - [704, 128, 1, 1280] - - [708, 3971.98] + - [782, 3971.98] - - [4, 3584, 1, 128] - - [777, 59.5238] + - [851, 59.5238] - - [1408, 448, 1, 1280] - - [751, 5902.17] + - [825, 5902.17] - - [1024, 1408, 1, 256] - - [746, 5272.94] + - [820, 5272.94] - - [2368, 2368, 1, 3328] - - [751, 8488.76] + - [825, 8488.76] - - [1856, 6784, 1, 128] - - [728, 4742.51] + - [802, 4742.51] - - [5056, 704, 1, 3328] - - [754, 7772.48] + - [828, 7772.48] - - [1408, 1856, 1, 256] - - [772, 5229.84] + - [846, 5229.84] - - [1408, 704, 1, 3328] - - [772, 6954.93] + - [846, 6954.93] - - [2368, 5056, 1, 256] - - [745, 8580.68] + - [819, 8580.68] - - [1408, 256, 1, 1280] - - [771, 4790.11] + - [845, 4790.11] - - [3072, 128, 1, 1024] - - [767, 4579.87] + - [841, 4579.87] - - [3584, 2368, 1, 1280] - - [738, 8675.13] + - [812, 8675.13] - - [4288, 64, 1, 3328] - - [717, 5550.11] + - [791, 5550.11] - - [2368, 4, 1, 1280] - - [783, 537.518] + - [857, 537.518] - - [704, 5888, 1, 256] - - [739, 5305.88] + - [813, 5305.88] - - [6784, 2944, 1, 128] - - [735, 4344.21] + - [809, 4344.21] - - [6784, 64, 1, 256] - - [765, 4496.42] + - [839, 4496.42] - - [2944, 256, 1, 256] - - [751, 6553.7] + - [825, 6553.7] - - [2944, 6784, 1, 3328] - - [739, 8895.76] + - [813, 8895.76] - - [128, 1, 1, 1408] - - [719, 25.7] + - [793, 25.7] - - [704, 1408, 1, 3328] - - [753, 7913.21] + - [827, 7913.21] - - [3584, 704, 1, 3328] - - [738, 7526.43] + - [812, 7526.43] - - [2944, 256, 1, 128] - - [722, 2830.76] + - [796, 2830.76] - - [6784, 4, 1, 1280] - - [779, 645.235] + - [853, 645.235] - - [1024, 64, 1, 1280] - - [677, 3013.25] + - [751, 3013.25] - - [8448, 4, 1, 2816] - - [629, 984.768] + - [703, 984.768] - - [448, 4288, 1, 256] - - [751, 7139.79] + - [825, 7139.79] - - [64, 3584, 1, 3328] - - [675, 5683.27] + - [749, 5683.27] - - [704, 2368, 1, 1280] - - [759, 7045.3] + - [833, 7045.3] - - [1856, 2368, 1, 1280] - - [756, 8327.9] + - [830, 8327.9] - - [2368, 128, 1, 3328] - - [692, 6082.65] + - [766, 6082.65] - - [64, 193600, 1, 64] - - [741, 6747.77] + - [815, 6747.77] - - [1760, 128, 1, 1760] - - [669, 5513.07] + - [743, 5513.07] - - [448, 1408, 1, 256] - - [751, 5591.54] + - [825, 5591.54] - - [1856, 4288, 1, 1280] - - [749, 8647.72] + - [823, 8647.72] - - [64, 5056, 1, 3328] - - [709, 6096.59] + - [783, 6096.59] - - [512, 1500, 1, 2816] - - [751, 7879.3] + - [825, 7879.3] - - [1024, 448, 1, 128] - - [723, 1844.33] + - [797, 1844.33] - - [704, 4, 1, 1280] - - [689, 341.433] + - [763, 341.433] - - [704, 256, 1, 128] - - [723, 1001.34] + - [797, 1001.34] - - [256, 193600, 1, 64] - - [759, 8113.3] + - [833, 8113.3] - - [704, 2944, 1, 128] - - [730, 3747.13] + - [804, 3747.13] - - [1408, 1024, 1, 1280] - - [756, 7080.71] + - [830, 7080.71] - - [704, 6784, 1, 256] - - [774, 6630.47] + - [848, 6630.47] - - [6784, 704, 1, 256] - - [741, 8005.86] + - [815, 8005.86] - - [5056, 1408, 1, 128] - - [732, 4303.13] + - [806, 4303.13] - - [2048, 7000, 1, 2048] - - [749, 9269.2] + - [823, 9269.2] - - [256, 3584, 1, 3328] - - [743, 7334.48] + - [817, 7334.48] - - [5056, 704, 1, 256] - - [751, 7954.12] + - [825, 7954.12] - - [128, 1408, 1, 128] - - [646, 1243.02] + - [720, 1243.02] - - [3584, 4288, 1, 3328] - - [775, 7683.81] + - [849, 7683.81] - - [5888, 1856, 1, 1280] - - [739, 8831.34] + - [813, 8831.34] - - [256, 1408, 1, 256] - - [741, 4352.68] + - [815, 4352.68] - - [5056, 64, 1, 1280] - - [708, 5012.05] + - [782, 5012.05] - - [1024, 704, 1, 256] - - [741, 5710.17] + - [815, 5710.17] - - [64, 256, 1, 128] - - [647, 149.897] + - [721, 149.897] - - [2368, 3584, 1, 1280] - - [749, 8609.68] + - [823, 8609.68] - - [1024, 256, 1, 256] - - [765, 3276.9] + - [839, 3276.9] - - [1856, 4, 1, 1280] - - [663, 497.104] + - [737, 497.104] - - [448, 448, 1, 256] - - [751, 3117.83] + - [825, 3117.83] - - [2944, 3584, 1, 3328] - - [739, 8879.45] + - [813, 8879.45] - - [7680, 32, 1, 2560] - - [709, 5310.24] + - [783, 5310.24] - - [128, 4288, 1, 128] - - [649, 2116.2] + - [723, 2116.2] - - [256, 256, 1, 3328] - - [702, 4774.7] + - [776, 4774.7] - - [128, 1024, 1, 3328] - - [703, 5894.8] + - [777, 5894.8] - - [4, 1408, 1, 3328] - - [694, 552.674] + - [768, 552.674] - - [196, 256, 64, 1024] - - [792, 5218.34] + - [866, 5218.34] - - [6784, 2944, 1, 256] - - [757, 8271.18] + - [831, 8271.18] - - [64, 1856, 1, 1280] - - [708, 4167.96] + - [782, 4167.96] - - [64, 1024, 1, 128] - - [642, 589.188] + - [716, 589.188] - - [1024, 1500, 1, 2560] - - [746, 8407.88] + - [820, 8407.88] - - [1856, 2368, 1, 256] - - [741, 8092.15] + - [815, 8092.15] - - [3584, 256, 1, 128] - - [724, 2607.57] + - [798, 2607.57] - - [3584, 6784, 1, 3328] - - [758, 8558.83] + - [832, 8558.83] - - [256, 1024, 1, 256] - - [751, 3901.78] + - [825, 3901.78] - - [4, 6784, 1, 3328] - - [689, 662.575] + - [763, 662.575] - - [1024, 5888, 1, 3328] - - [749, 9161.76] + - [823, 9161.76] - - [1024, 128, 1, 1280] - - [706, 3942.12] + - [780, 3942.12] - - [3072, 32, 1, 1024] - - [683, 2840.49] + - [757, 2840.49] - - [6144, 24000, 1, 2560] - - [739, 7605.87] + - [813, 7605.87] - - [448, 1024, 1, 256] - - [741, 5062.19] + - [815, 5062.19] - - [5056, 4288, 1, 1280] - - [749, 9090.99] + - [823, 9090.99] - - [5888, 64, 1, 256] - - [751, 4449.78] + - [825, 4449.78] - - [1856, 256, 1, 1280] - - [765, 5834.46] + - [839, 5834.46] - - [64, 5888, 1, 3328] - - [703, 6152.44] + - [777, 6152.44] - - [2368, 2368, 1, 1280] - - [743, 8594.66] + - [817, 8594.66] - - [2944, 5888, 1, 128] - - [728, 4776.19] + - [802, 4776.19] - - [704, 5888, 1, 1280] - - [743, 8435.91] + - [817, 8435.91] - - [2368, 3584, 1, 128] - - [725, 4590.71] + - [799, 4590.71] - - [1856, 5056, 1, 128] - - [736, 4503.48] + - [810, 4503.48] - - [4608, 1, 1, 1536] - - [634, 226.955] + - [708, 226.955] - - [448, 256, 1, 3328] - - [678, 5415.56] + - [752, 5415.56] - - [2944, 6784, 1, 1280] - - [762, 8385.11] + - [836, 8385.11] - - [448, 1856, 1, 128] - - [732, 2618.96] + - [806, 2618.96] - - [128, 1024, 1, 128] - - [641, 940.527] + - [715, 940.527] - - [7680, 4, 1, 2560] - - [665, 985.104] + - [739, 985.104] - - [1024, 704, 1, 1280] - - [751, 7204.56] + - [825, 7204.56] - - [128, 5888, 1, 256] - - [741, 6313.52] + - [815, 6313.52] - - [1024, 5056, 1, 1280] - - [746, 8979.76] + - [820, 8979.76] - - [4288, 1024, 1, 256] - - [738, 7198.29] + - [812, 7198.29] - - [2944, 2368, 1, 128] - - [723, 4624.57] + - [797, 4624.57] - - [704, 704, 1, 3328] - - [764, 5870.71] + - [838, 5870.71] - - [704, 1408, 1, 1280] - - [753, 7680.32] + - [827, 7680.32] - - [5888, 448, 1, 1280] - - [741, 7718.66] + - [815, 7718.66] - - [3584, 256, 1, 3328] - - [746, 7523.88] + - [820, 7523.88] - - [704, 5888, 1, 3328] - - [751, 8196.99] + - [825, 8196.99] - - [704, 1856, 1, 128] - - [729, 3388.43] + - [803, 3388.43] - - [128, 3584, 1, 3328] - - [703, 6626.5] + - [777, 6626.5] - - [4, 4288, 1, 128] - - [776, 159.648] + - [850, 159.648] - - [128, 704, 1, 1280] - - [666, 4038.73] + - [740, 4038.73] - - [3584, 2944, 1, 256] - - [739, 7685.99] + - [813, 7685.99] - - [1856, 128, 1, 3328] - - [695, 6070.63] + - [769, 6070.63] - - [1856, 2368, 1, 3328] - - [756, 8460.62] + - [830, 8460.62] - - [512, 6000, 1, 2816] - - [759, 9019.55] + - [833, 9019.55] - - [2944, 448, 1, 128] - - [722, 3027.73] + - [796, 3027.73] - - [64, 193600, 1, 256] - - [765, 7080.32] + - [839, 7080.32] - - [128, 2944, 1, 1280] - - [741, 5397.87] + - [815, 5397.87] - - [448, 2944, 1, 1280] - - [751, 6996.97] + - [825, 6996.97] - - [512, 24000, 1, 2048] - - [759, 8832.67] + - [833, 8832.67] - - [128, 256, 1, 3328] - - [698, 3531.57] + - [772, 3531.57] - - [1408, 5056, 1, 3328] - - [754, 7969.94] + - [828, 7969.94] - - [1856, 1856, 1, 3328] - - [741, 8140.34] + - [815, 8140.34] - - [3584, 128, 1, 256] - - [751, 4861.05] + - [825, 4861.05] - - [448, 1408, 1, 3328] - - [741, 6353.75] + - [815, 6353.75] - - [2368, 2368, 1, 256] - - [755, 8369.37] + - [829, 8369.37] - - [4288, 4288, 1, 1280] - - [745, 8666.52] + - [819, 8666.52] - - [64, 448, 1, 1280] - - [698, 2591.92] + - [772, 2591.92] - - [5888, 1024, 1, 1280] - - [738, 8526.6] + - [812, 8526.6] - - [704, 1024, 1, 256] - - [751, 4971.8] + - [825, 4971.8] - - [1024, 12544, 1, 256] - - [789, 8611.9] + - [863, 8611.9] - - [448, 4, 1, 256] - - [694, 78.6534] + - [768, 78.6534] - - [5888, 448, 1, 128] - - [725, 3592.03] + - [799, 3592.03] - - [512, 48000, 1, 2560] - - [759, 9237.44] + - [833, 9237.44] - - [8448, 16, 1, 2816] - - [624, 3360.21] + - [698, 3360.21] - - [704, 6784, 1, 3328] - - [760, 7774.95] + - [834, 7774.95] - - [5888, 5888, 1, 1280] - - [746, 9238.25] + - [820, 9238.25] - - [5056, 1024, 1, 1280] - - [774, 8227.88] + - [848, 8227.88] - - [448, 5888, 1, 3328] - - [749, 7777.63] + - [823, 7777.63] - - [3072, 2, 1, 1024] - - [686, 376.383] + - [760, 376.383] - - [1024, 2944, 1, 1280] - - [739, 8650.45] + - [813, 8650.45] - - [5056, 5888, 1, 1280] - - [749, 8861.6] + - [823, 8861.6] - - [4288, 5888, 1, 128] - - [729, 5049.01] + - [803, 5049.01] - - [256, 3584, 1, 256] - - [741, 6314.11] + - [815, 6314.11] - - [256, 4, 1, 1280] - - [785, 163.94] + - [859, 163.94] - - [1408, 3584, 1, 128] - - [729, 4290.22] + - [803, 4290.22] - - [256, 2944, 1, 3328] - - [751, 7620.99] + - [825, 7620.99] - - [448, 3584, 1, 128] - - [729, 3353.9] + - [803, 3353.9] - - [5888, 2944, 1, 1280] - - [739, 9498.31] + - [813, 9498.31] - - [4, 6784, 1, 1280] - - [689, 623.916] + - [763, 623.916] - - [2368, 5888, 1, 128] - - [728, 4840.29] + - [802, 4840.29] - - [35, 8457, 1, 1760] - - [635, 4059.88] + - [709, 4059.88] - - [64, 2944, 1, 128] - - [646, 1310.82] + - [720, 1310.82] - - [2368, 4, 1, 256] - - [780, 369.739] + - [854, 369.739] - - [3584, 5888, 1, 256] - - [757, 7996.33] + - [831, 7996.33] - - [2368, 1024, 1, 128] - - [723, 3915.07] + - [797, 3915.07] - - [2368, 704, 1, 128] - - [723, 3658.97] + - [797, 3658.97] - - [512, 32, 1, 512] - - [712, 1127.6] + - [786, 1127.6] - - [3584, 2368, 1, 128] - - [723, 4462.48] + - [797, 4462.48] - - [5056, 704, 1, 128] - - [722, 4062.21] + - [796, 4062.21] - - [448, 2368, 1, 128] - - [723, 2829.07] + - [797, 2829.07] - - [4, 5056, 1, 256] - - [671, 425.868] + - [745, 425.868] - - [5056, 1408, 1, 3328] - - [756, 8848.92] + - [830, 8848.92] - - [1408, 704, 1, 256] - - [751, 5394.56] + - [825, 5394.56] - - [6784, 1024, 1, 3328] - - [738, 9232.02] + - [812, 9232.02] - - [6784, 2944, 1, 3328] - - [749, 8714.84] + - [823, 8714.84] - - [7680, 1, 1, 2560] - - [685, 248.845] + - [759, 248.845] - - [1856, 1856, 1, 256] - - [750, 7586.58] + - [824, 7586.58] - - [64, 64, 1, 3328] - - [720, 1363.25] + - [794, 1363.25] - - [512, 1, 1, 512] - - [634, 43.2158] + - [708, 43.2158] - - [6784, 2368, 1, 1280] - - [751, 8665.74] + - [825, 8665.74] - - [4608, 2, 1, 1536] - - [634, 452.65] + - [708, 452.65] - - [4288, 3584, 1, 256] - - [759, 8936.7] + - [833, 8936.7] - - [4288, 5888, 1, 1280] - - [756, 8957.15] + - [830, 8957.15] - - [4608, 4, 1, 1536] - - [627, 846.737] + - [701, 846.737] - - [1024, 6000, 1, 1536] - - [749, 8398.54] + - [823, 8398.54] - - [8448, 32, 1, 2816] - - [709, 5343.07] + - [783, 5343.07] - - [448, 2944, 1, 3328] - - [756, 7247.04] + - [830, 7247.04] - - [4288, 1856, 1, 1280] - - [739, 8902.86] + - [813, 8902.86] - - [1856, 2944, 1, 3328] - - [751, 8622.86] + - [825, 8622.86] - - [256, 6784, 1, 3328] - - [751, 8050.77] + - [825, 8050.77] - - [512, 3000, 1, 1536] - - [772, 7108.12] + - [846, 7108.12] - - [64, 5888, 1, 256] - - [764, 3567.74] + - [838, 3567.74] - - [256, 5056, 1, 128] - - [731, 3041.12] + - [805, 3041.12] - - [5056, 1024, 1, 256] - - [755, 8401.47] + - [829, 8401.47] - - [704, 64, 1, 3328] - - [714, 4299.02] + - [788, 4299.02] - - [5056, 1856, 1, 3328] - - [759, 8660.77] + - [833, 8660.77] - - [4, 2944, 1, 3328] - - [689, 618.637] + - [763, 618.637] - - [512, 1500, 1, 2048] - - [771, 5481.22] + - [845, 5481.22] - - [1024, 1, 1, 500000] - - [625, 260.061] + - [699, 260.061] - - [256, 4, 1, 256] - - [689, 50.5123] + - [763, 50.5123] - - [6784, 128, 1, 3328] - - [743, 6950.91] + - [817, 6950.91] - - [4288, 1408, 1, 128] - - [723, 4539.58] + - [797, 4539.58] - - [1856, 5888, 1, 3328] - - [749, 8712.93] + - [823, 8712.93] - - [4288, 5056, 1, 256] - - [755, 8997.15] + - [829, 8997.15] - - [1408, 128, 1, 1280] - - [678, 4599.12] + - [752, 4599.12] - - [4096, 7000, 1, 4096] - - [745, 8555.89] + - [819, 8555.89] - - [5056, 256, 1, 3328] - - [751, 8257.16] + - [825, 8257.16] - - [704, 704, 1, 256] - - [741, 5852.39] + - [815, 5852.39] - - [1024, 3000, 1, 2560] - - [738, 8258.84] + - [812, 8258.84] - - [1024, 5888, 1, 1280] - - [738, 8988.99] + - [812, 8988.99] - - [6784, 2368, 1, 128] - - [724, 4562.25] + - [798, 4562.25] - - [4, 5056, 1, 1280] - - [689, 600.441] + - [763, 600.441] - - [256, 64, 1, 1280] - - [712, 1899.69] + - [786, 1899.69] - - [128, 1856, 1, 1280] - - [751, 5185.76] + - [825, 5185.76] - - [1856, 1024, 1, 1280] - - [756, 7875.95] + - [830, 7875.95] - - [6784, 4288, 1, 1280] - - [759, 8981.18] + - [833, 8981.18] - - [1856, 1856, 1, 1280] - - [740, 7794.71] + - [814, 7794.71] - - [35, 1500, 1, 2048] - - [640, 2192.6] + - [714, 2192.6] - - [3072, 24000, 1, 1024] - - [752, 8690.58] + - [826, 8690.58] - - [1408, 5056, 1, 1280] - - [751, 8427.87] + - [825, 8427.87] - - [4, 2368, 1, 3328] - - [694, 594.422] + - [768, 594.422] - - [5888, 1856, 1, 128] - - [723, 4294.05] + - [797, 4294.05] - - [448, 704, 1, 1280] - - [746, 4136.39] + - [820, 4136.39] - - [448, 6784, 1, 128] - - [724, 3976.2] + - [798, 3976.2] - - [1024, 448, 1, 3328] - - [756, 6376.33] + - [830, 6376.33] - - [2944, 128, 1, 256] - - [741, 4466.26] + - [815, 4466.26] - - [5056, 3584, 1, 128] - - [729, 4997.18] + - [803, 4997.18] - - [5888, 5888, 1, 3328] - - [759, 8870.37] + - [833, 8870.37] - - [6784, 1024, 1, 256] - - [738, 8520.53] + - [812, 8520.53] - - [2944, 2368, 1, 256] - - [775, 6174.59] + - [849, 6174.59] - - [256, 448, 1, 256] - - [751, 1844.33] + - [825, 1844.33] - - [5056, 5888, 1, 3328] - - [740, 8076.65] + - [814, 8076.65] - - [1856, 1024, 1, 256] - - [751, 7188.92] + - [825, 7188.92] - - [512, 48000, 1, 1536] - - [762, 7282.2] + - [836, 7282.2] - - [3584, 448, 1, 1280] - - [741, 6869.1] + - [815, 6869.1] - - [1024, 1024, 1, 1280] - - [751, 8027.45] + - [825, 8027.45] - - [448, 5888, 1, 256] - - [741, 5765.84] + - [815, 5765.84] - - [2048, 128, 1, 2048] - - [699, 4835.01] + - [773, 4835.01] - - [1408, 6784, 1, 3328] - - [751, 8613.76] + - [825, 8613.76] - - [448, 1024, 1, 128] - - [722, 2315.57] + - [796, 2315.57] - - [4288, 704, 1, 128] - - [723, 4138.92] + - [797, 4138.92] - - [128, 1856, 1, 128] - - [658, 1397.56] + - [732, 1397.56] - - [448, 2368, 1, 3328] - - [741, 6786.48] + - [815, 6786.48] - - [5056, 64, 1, 128] - - [723, 1664.84] + - [797, 1664.84] - - [5056, 2944, 1, 256] - - [774, 7697.49] + - [848, 7697.49] - - [6784, 5888, 1, 128] - - [723, 5003.67] + - [797, 5003.67] - - [1024, 700, 1, 512] - - [751, 6036.31] + - [825, 6036.31] - - [3072, 1, 1, 128] - - [705, 70.3171] + - [779, 70.3171] - - [1024, 4, 1, 256] - - [663, 154.302] + - [737, 154.302] - - [2944, 704, 1, 128] - - [729, 3697.0] + - [803, 3697.0] - - [128, 6784, 1, 1280] - - [741, 6731.51] + - [815, 6731.51] - - [1408, 3584, 1, 3328] - - [739, 9258.07] + - [813, 9258.07] - - [2368, 6784, 1, 256] - - [738, 8840.4] + - [812, 8840.4] - - [5056, 1408, 1, 1280] - - [739, 9240.84] + - [813, 9240.84] - - [5056, 4288, 1, 128] - - [734, 4309.18] + - [808, 4309.18] - - [4, 704, 1, 256] - - [689, 130.697] + - [763, 130.697] - - [4288, 2368, 1, 3328] - - [752, 8755.33] + - [826, 8755.33] - - [1408, 1856, 1, 128] - - [722, 3918.75] + - [796, 3918.75] - - [1408, 5888, 1, 3328] - - [759, 8910.47] + - [833, 8910.47] - - [1856, 256, 1, 256] - - [741, 5631.34] + - [815, 5631.34] - - [6784, 6784, 1, 256] - - [749, 9298.76] + - [823, 9298.76] - - [5888, 5056, 1, 128] - - [724, 4811.36] + - [798, 4811.36] - - [4288, 2368, 1, 128] - - [723, 4749.1] + - [797, 4749.1] - - [128, 5888, 1, 1280] - - [750, 6393.86] + - [824, 6393.86] - - [256, 4288, 1, 1280] - - [741, 6887.79] + - [815, 6887.79] - - [2368, 2944, 1, 256] - - [755, 8314.82] + - [829, 8314.82] - - [4, 1856, 1, 256] - - [778, 267.03] + - [852, 267.03] - - [3584, 1856, 1, 1280] - - [739, 8631.91] + - [813, 8631.91] - - [6784, 6784, 1, 128] - - [729, 5059.96] + - [803, 5059.96] - - [256, 1856, 1, 128] - - [722, 1858.82] + - [796, 1858.82] - - [49, 512, 64, 2048] - - [793, 3053.67] + - [867, 3053.67] - - [704, 64, 1, 1280] - - [672, 2849.49] + - [746, 2849.49] - - [5888, 5056, 1, 256] - - [758, 8202.52] + - [832, 8202.52] - - [8448, 48000, 1, 2816] - - [749, 4281.94] + - [823, 4281.94] - - [512, 6000, 1, 2048] - - [741, 8047.89] + - [815, 8047.89] - - [3584, 448, 1, 256] - - [751, 6805.43] + - [825, 6805.43] - - [448, 4288, 1, 128] - - [729, 3500.83] + - [803, 3500.83] - - [7680, 64, 1, 2560] - - [684, 5957.9] + - [758, 5957.9] - - [256, 6784, 1, 256] - - [751, 7331.83] + - [825, 7331.83] - - [1408, 4288, 1, 128] - - [723, 4501.49] + - [797, 4501.49] - - [2944, 704, 1, 3328] - - [751, 8439.7] + - [825, 8439.7] - - [128, 448, 1, 256] - - [672, 1555.19] + - [746, 1555.19] - - [2048, 32, 1, 2048] - - [683, 3226.49] + - [757, 3226.49] - - [3584, 3584, 1, 256] - - [755, 8784.9] + - [829, 8784.9] - - [448, 1408, 1, 128] - - [722, 2535.92] + - [796, 2535.92] - - [128, 256, 1, 1280] - - [698, 2896.72] + - [772, 2896.72] - - [3584, 5056, 1, 256] - - [742, 8566.52] + - [816, 8566.52] - - [6784, 128, 1, 256] - - [741, 6053.97] + - [815, 6053.97] - - [4288, 4, 1, 256] - - [661, 428.9] + - [735, 428.9] - - [64, 1408, 1, 3328] - - [666, 5025.11] + - [740, 5025.11] - - [704, 448, 1, 256] - - [765, 3409.74] + - [839, 3409.74] - - [2944, 2368, 1, 1280] - - [739, 9066.35] + - [813, 9066.35] - - [448, 64, 1, 3328] - - [714, 3528.96] + - [788, 3528.96] - - [704, 6784, 1, 128] - - [728, 4212.61] + - [802, 4212.61] - - [3584, 4, 1, 3328] - - [781, 658.353] + - [855, 658.353] - - [6784, 3584, 1, 256] - - [749, 9061.84] + - [823, 9061.84] - - [704, 448, 1, 128] - - [728, 1552.8] + - [802, 1552.8] - - [256, 128, 1, 128] - - [653, 281.975] + - [727, 281.975] - - [704, 1408, 1, 128] - - [728, 3026.76] + - [802, 3026.76] - - [4, 448, 1, 128] - - [777, 5.56127] + - [851, 5.56127] - - [4288, 128, 1, 1280] - - [708, 5471.64] + - [782, 5471.64] - - [128, 1408, 1, 256] - - [751, 2813.35] + - [825, 2813.35] - - [4, 2944, 1, 256] - - [671, 316.766] + - [745, 316.766] - - [64, 128, 1, 3328] - - [719, 1872.56] + - [793, 1872.56] - - [1856, 1408, 1, 256] - - [741, 7735.89] + - [815, 7735.89] - - [5056, 2368, 1, 128] - - [723, 4830.19] + - [797, 4830.19] - - [2944, 2944, 1, 3328] - - [759, 8890.11] + - [833, 8890.11] - - [5056, 6784, 1, 256] - - [749, 9015.25] + - [823, 9015.25] - - [1856, 3584, 1, 128] - - [730, 4455.12] + - [804, 4455.12] - - [5888, 4, 1, 1280] - - [779, 642.063] + - [853, 642.063] - - [128, 2944, 1, 128] - - [648, 2037.03] + - [722, 2037.03] - - [35, 8457, 1, 2560] - - [636, 3988.23] + - [710, 3988.23] - - [3584, 6784, 1, 128] - - [723, 4774.54] + - [797, 4774.54] - - [128, 4288, 1, 256] - - [741, 4851.85] + - [815, 4851.85] - - [704, 448, 1, 3328] - - [756, 4432.63] + - [830, 4432.63] - - [2368, 6784, 1, 1280] - - [739, 9161.48] + - [813, 9161.48] - - [128, 128, 1, 3328] - - [713, 2839.99] + - [787, 2839.99] - - [5056, 1856, 1, 256] - - [755, 8380.94] + - [829, 8380.94] - - [256, 128, 1, 256] - - [697, 1165.18] + - [771, 1165.18] - - [1024, 3000, 1, 2816] - - [756, 8714.27] + - [830, 8714.27] - - [1024, 1856, 1, 256] - - [746, 7014.79] + - [820, 7014.79] - - [64, 1, 1, 1216] - - [719, 11.8205] + - [793, 11.8205] - - [4288, 64, 1, 128] - - [650, 1669.65] + - [724, 1669.65] - - [256, 448, 1, 3328] - - [674, 5152.39] + - [748, 5152.39] - - [1408, 6784, 1, 1280] - - [759, 8735.22] + - [833, 8735.22] - - [3584, 3584, 1, 1280] - - [756, 9020.09] + - [830, 9020.09] - - [7680, 24000, 1, 2560] - - [759, 6940.24] + - [833, 6940.24] - - [64, 2368, 1, 1280] - - [669, 4433.07] + - [743, 4433.07] - - [448, 2368, 1, 1280] - - [744, 5352.92] + - [818, 5352.92] - - [4608, 48000, 1, 1536] - - [738, 8129.11] + - [812, 8129.11] - - [5888, 5888, 1, 128] - - [731, 4700.91] + - [805, 4700.91] - - [64, 6784, 1, 3328] - - [741, 6170.82] + - [815, 6170.82] - - [2944, 256, 1, 1280] - - [771, 6177.65] + - [845, 6177.65] - - [2048, 16, 1, 2048] - - [693, 2167.7] + - [767, 2167.7] - - [256, 2368, 1, 128] - - [722, 2037.77] + - [796, 2037.77] - - [5056, 2368, 1, 3328] - - [739, 9040.6] + - [813, 9040.6] - - [2944, 4288, 1, 256] - - [770, 7552.22] + - [844, 7552.22] - - [1408, 3584, 1, 1280] - - [746, 8808.76] + - [820, 8808.76] - - [2368, 64, 1, 256] - - [682, 2320.51] + - [756, 2320.51] - - [1024, 128, 1, 128] - - [642, 1075.56] + - [716, 1075.56] - - [704, 128, 1, 3328] - - [675, 4985.02] + - [749, 4985.02] - - [5888, 4, 1, 128] - - [776, 33.6558] + - [850, 33.6558] - - [1856, 704, 1, 256] - - [751, 7110.98] + - [825, 7110.98] - - [1024, 1500, 1, 2816] - - [746, 8499.88] + - [820, 8499.88] - - [8448, 1, 1, 2816] - - [629, 251.469] + - [703, 251.469] - - [1024, 4, 1, 3328] - - [785, 541.032] + - [859, 541.032] - - [1024, 6000, 1, 2048] - - [746, 8698.59] + - [820, 8698.59] - - [512, 24000, 1, 2560] - - [739, 8963.7] + - [813, 8963.7] - - [6144, 3000, 1, 2560] - - [762, 8761.97] + - [836, 8761.97] - - [2368, 6784, 1, 3328] - - [756, 8867.49] + - [830, 8867.49] - - [1856, 1408, 1, 1280] - - [743, 7908.53] + - [817, 7908.53] - - [1856, 448, 1, 1280] - - [756, 6544.01] + - [830, 6544.01] - - [6784, 704, 1, 128] - - [722, 4086.45] + - [796, 4086.45] - - [4, 4, 1, 256] - - [689, 0.852941] + - [763, 0.852941] - - [128, 5888, 1, 128] - - [646, 2582.25] + - [720, 2582.25] - - [5056, 2944, 1, 128] - - [726, 4579.17] + - [800, 4579.17] - - [1408, 5888, 1, 256] - - [738, 8810.77] + - [812, 8810.77] - - [704, 2944, 1, 1280] - - [753, 8420.9] + - [827, 8420.9] - - [4288, 64, 1, 1280] - - [678, 4906.15] + - [752, 4906.15] - - [256, 64, 1, 256] - - [680, 689.953] + - [754, 689.953] - - [1024, 1024, 1, 256] - - [756, 5528.01] + - [830, 5528.01] - - [704, 1856, 1, 256] - - [740, 4452.92] + - [814, 4452.92] - - [2560, 64, 1, 2560] - - [669, 4563.09] + - [743, 4563.09] - - [3584, 704, 1, 1280] - - [746, 7898.77] + - [820, 7898.77] - - [256, 128, 1, 1280] - - [698, 2865.06] + - [772, 2865.06] - - [5888, 2368, 1, 256] - - [745, 8628.37] + - [819, 8628.37] - - [256, 2368, 1, 1280] - - [741, 6073.57] + - [815, 6073.57] - - [2944, 6784, 1, 128] - - [722, 4756.77] + - [796, 4756.77] - - [3584, 448, 1, 3328] - - [741, 7265.07] + - [815, 7265.07] - - [1408, 4, 1, 256] - - [782, 234.157] + - [856, 234.157] - - [704, 2368, 1, 3328] - - [739, 7248.98] + - [813, 7248.98] - - [2944, 448, 1, 256] - - [746, 6365.89] + - [820, 6365.89] - - [1856, 448, 1, 128] - - [724, 2976.34] + - [798, 2976.34] - - [4608, 6000, 1, 1536] - - [759, 9469.42] + - [833, 9469.42] - - [2368, 128, 1, 1280] - - [708, 4773.39] + - [782, 4773.39] - - [256, 5888, 1, 128] - - [723, 3112.0] + - [797, 3112.0] - - [64, 6784, 1, 256] - - [741, 3755.14] + - [815, 3755.14] - - [64, 5056, 1, 1280] - - [702, 4935.6] + - [776, 4935.6] - - [4, 6784, 1, 128] - - [777, 111.142] + - [851, 111.142] - - [3025, 64, 64, 64] - - [791, 6643.75] + - [865, 6643.75] - - [2944, 2944, 1, 1280] - - [739, 8869.55] + - [813, 8869.55] - - [5056, 448, 1, 3328] - - [772, 6706.2] + - [846, 6706.2] - - [4, 3584, 1, 1280] - - [689, 573.54] + - [763, 573.54] - - [1408, 128, 1, 128] - - [641, 1293.19] + - [715, 1293.19] - - [6784, 704, 1, 3328] - - [756, 8368.33] + - [830, 8368.33] - - [128, 64, 1, 1280] - - [715, 1260.41] + - [789, 1260.41] - - [2368, 256, 1, 1280] - - [741, 6154.47] + - [815, 6154.47] - - [4, 448, 1, 3328] - - [694, 351.738] + - [768, 351.738] - - [5888, 4288, 1, 128] - - [723, 4340.99] + - [797, 4340.99] - - [4, 5888, 1, 256] - - [671, 428.318] + - [745, 428.318] - - [1408, 2944, 1, 3328] - - [738, 9400.85] + - [812, 9400.85] - - [3584, 704, 1, 128] - - [725, 3392.55] + - [799, 3392.55] - - [64, 1024, 1, 256] - - [672, 1762.41] + - [746, 1762.41] - - [2368, 448, 1, 1280] - - [765, 5972.58] + - [839, 5972.58] - - [128, 3584, 1, 256] - - [741, 5224.32] + - [815, 5224.32] - - [704, 448, 1, 1280] - - [741, 4566.86] + - [815, 4566.86] - - [448, 5056, 1, 128] - - [723, 3876.19] + - [797, 3876.19] - - [6144, 4, 1, 2560] - - [665, 948.751] + - [739, 948.751] - - [5056, 3584, 1, 256] - - [755, 8162.56] + - [829, 8162.56] - - [4288, 4288, 1, 256] - - [762, 7653.34] + - [836, 7653.34] - - [1408, 5056, 1, 128] - - [729, 4554.34] + - [803, 4554.34] - - [2944, 3584, 1, 128] - - [735, 4147.0] + - [809, 4147.0] - - [3584, 2368, 1, 256] - - [756, 8195.05] + - [830, 8195.05] - - [5888, 5056, 1, 1280] - - [755, 9413.43] + - [829, 9413.43] - - [128, 1024, 1, 1280] - - [708, 4433.83] + - [782, 4433.83] - - [8448, 24000, 1, 2816] - - [749, 5227.12] + - [823, 5227.12] - - [64, 704, 1, 256] - - [672, 1441.89] + - [746, 1441.89] - - [4288, 256, 1, 1280] - - [771, 5687.8] + - [845, 5687.8] - - [3584, 3584, 1, 3328] - - [746, 9183.63] + - [820, 9183.63] - - [704, 64, 1, 128] - - [650, 402.835] + - [724, 402.835] - - [3072, 1500, 1, 128] - - [745, 7395.08] + - [819, 7395.08] - - [2048, 3136, 1, 512] - - [787, 8447.3] + - [861, 8447.3] - - [3025, 256, 64, 64] - - [795, 8063.79] + - [869, 8063.79] - - [5888, 6784, 1, 256] - - [739, 9282.01] + - [813, 9282.01] - - [4288, 2944, 1, 3328] - - [739, 9153.87] + - [813, 9153.87] - - [2944, 64, 1, 128] - - [656, 1463.53] + - [730, 1463.53] - - [1024, 128, 1, 3328] - - [706, 5377.41] + - [780, 5377.41] - - [1024, 16, 1, 500000] - - [622, 3997.13] + - [696, 3997.13] - - [4288, 128, 1, 3328] - - [710, 6053.31] + - [784, 6053.31] - - [7680, 128, 1, 2560] - - [756, 7769.24] + - [830, 7769.24] - - [256, 5056, 1, 1280] - - [765, 7200.84] + - [839, 7200.84] - - [1408, 256, 1, 128] - - [733, 1671.74] + - [807, 1671.74] - - [2944, 5888, 1, 3328] - - [745, 8642.18] + - [819, 8642.18] - - [6784, 5888, 1, 1280] - - [759, 8871.15] + - [833, 8871.15] - - [3072, 1, 1, 1024] - - [705, 205.972] + - [779, 205.972] - - [704, 128, 1, 256] - - [668, 1935.39] + - [742, 1935.39] - - [5888, 4288, 1, 1280] - - [746, 9176.7] + - [820, 9176.7] - - [1024, 24000, 1, 2048] - - [745, 8667.79] + - [819, 8667.79] - - [448, 256, 1, 1280] - - [678, 4327.95] + - [752, 4327.95] - - [5888, 3584, 1, 128] - - [723, 4669.45] + - [797, 4669.45] - - [64, 4288, 1, 3328] - - [703, 5375.04] + - [777, 5375.04] - - [448, 4, 1, 1280] - - [694, 289.716] + - [768, 289.716] - - [6784, 6784, 1, 3328] - - [752, 8306.73] + - [826, 8306.73] - - [5056, 4, 1, 1280] - - [664, 607.199] + - [738, 607.199] - - [4, 5888, 1, 3328] - - [689, 651.538] + - [763, 651.538] - - [256, 1408, 1, 1280] - - [741, 5177.09] + - [815, 5177.09] - - [3072, 16, 1, 1024] - - [700, 2207.63] + - [774, 2207.63] - - [704, 3584, 1, 128] - - [733, 3653.51] + - [807, 3653.51] - - [1024, 2, 1, 512] - - [720, 156.138] + - [794, 156.138] - - [5888, 448, 1, 3328] - - [741, 7896.85] + - [815, 7896.85] - - [2368, 4288, 1, 1280] - - [738, 8517.63] + - [812, 8517.63] - - [4288, 2944, 1, 128] - - [727, 4439.26] + - [801, 4439.26] - - [256, 64, 1, 3328] - - [713, 2704.76] + - [787, 2704.76] - - [2944, 64, 1, 3328] - - [678, 5647.15] + - [752, 5647.15] - - [6784, 64, 1, 3328] - - [751, 6434.61] + - [825, 6434.61] - - [5056, 2944, 1, 3328] - - [762, 8497.2] + - [836, 8497.2] - - [448, 128, 1, 256] - - [680, 1516.64] + - [754, 1516.64] - - [2944, 3584, 1, 256] - - [756, 8365.83] + - [830, 8365.83] - - [1408, 1408, 1, 3328] - - [739, 8440.42] + - [813, 8440.42] - - [1856, 128, 1, 1280] - - [741, 5242.93] + - [815, 5242.93] - - [3584, 3584, 1, 128] - - [723, 4385.94] + - [797, 4385.94] - - [64, 3584, 1, 256] - - [741, 3276.9] + - [815, 3276.9] - - [1408, 4, 1, 3328] - - [664, 605.504] + - [738, 605.504] - - [128, 2944, 1, 3328] - - [709, 6295.75] + - [783, 6295.75] - - [3584, 704, 1, 256] - - [746, 7711.64] + - [820, 7711.64] - - [2944, 448, 1, 3328] - - [757, 6503.97] + - [831, 6503.97] - - [1024, 2, 1, 500000] - - [626, 521.803] + - [700, 521.803] - - [3584, 1408, 1, 3328] - - [748, 8296.2] + - [822, 8296.2] - - [704, 3584, 1, 1280] - - [753, 7670.65] + - [827, 7670.65] - - [1024, 1408, 1, 128] - - [728, 2830.61] + - [802, 2830.61] - - [1856, 6784, 1, 256] - - [759, 8149.67] + - [833, 8149.67] - - [4288, 448, 1, 3328] - - [740, 7406.44] + - [814, 7406.44] - - [6784, 4288, 1, 128] - - [735, 4418.09] + - [809, 4418.09] - - [6784, 704, 1, 1280] - - [756, 8302.45] + - [830, 8302.45] - - [6144, 1, 1, 2560] - - [665, 243.427] + - [739, 243.427] - - [3584, 6784, 1, 256] - - [738, 9036.59] + - [812, 9036.59] - - [6144, 16, 1, 2560] - - [672, 3266.69] + - [746, 3266.69] - - [3584, 64, 1, 128] - - [656, 1555.19] + - [730, 1555.19] - - [5888, 1024, 1, 3328] - - [746, 8888.08] + - [820, 8888.08] - - [448, 64, 1, 128] - - [642, 248.074] + - [716, 248.074] - - [704, 6784, 1, 1280] - - [742, 7892.56] + - [816, 7892.56] - - [4, 448, 1, 256] - - [664, 70.8951] + - [738, 70.8951] - - [196, 1024, 64, 256] - - [790, 6630.86] + - [864, 6630.86] - - [5888, 128, 1, 256] - - [740, 5715.09] + - [814, 5715.09] - - [4096, 16, 1, 4096] - - [686, 3251.5] + - [760, 3251.5] - - [1856, 5056, 1, 3328] - - [755, 8740.27] + - [829, 8740.27] - - [4, 6784, 1, 256] - - [778, 360.412] + - [852, 360.412] - - [1024, 3584, 1, 128] - - [723, 3456.27] + - [797, 3456.27] - - [64, 704, 1, 3328] - - [691, 3817.47] + - [765, 3817.47] - - [2368, 2944, 1, 128] - - [729, 4605.47] + - [803, 4605.47] - - [5056, 64, 1, 256] - - [741, 3863.79] + - [815, 3863.79] - - [512, 1500, 1, 1536] - - [741, 6801.56] + - [815, 6801.56] - - [512, 1, 1, 500000] - - [630, 261.068] + - [704, 261.068] - - [5888, 2944, 1, 3328] - - [745, 8501.88] + - [819, 8501.88] - - [128, 3584, 1, 1280] - - [746, 5938.64] + - [820, 5938.64] - - [1024, 704, 1, 128] - - [732, 2172.29] + - [806, 2172.29] - - [1408, 2368, 1, 128] - - [728, 4023.2] + - [802, 4023.2] - - [5888, 2368, 1, 128] - - [729, 4424.62] + - [803, 4424.62] - - [128, 5056, 1, 3328] - - [741, 6692.16] + - [815, 6692.16] - - [3584, 6784, 1, 1280] - - [739, 9488.64] + - [813, 9488.64] - - [4288, 1856, 1, 256] - - [749, 8287.52] + - [823, 8287.52] - - [1856, 5888, 1, 256] - - [760, 7707.83] + - [834, 7707.83] - - [256, 256, 1, 256] - - [707, 1613.29] + - [781, 1613.29] - - [4288, 4288, 1, 3328] - - [749, 8923.59] + - [823, 8923.59] - - [1024, 1024, 1, 128] - - [729, 2553.71] + - [803, 2553.71] - - [4288, 1408, 1, 1280] - - [749, 8930.47] + - [823, 8930.47] - - [3584, 5056, 1, 128] - - [733, 4495.15] + - [807, 4495.15] - - [4, 1024, 1, 3328] - - [689, 415.694] + - [763, 415.694] - - [4, 704, 1, 128] - - [777, 13.9634] + - [851, 13.9634] - - [4288, 2368, 1, 256] - - [774, 7135.08] + - [848, 7135.08] - - [2944, 5056, 1, 1280] - - [746, 9118.61] + - [820, 9118.61] - - [448, 6784, 1, 256] - - [770, 5430.31] + - [844, 5430.31] - - [64, 128, 1, 128] - - [653, 83.057] + - [727, 83.057] - - [1856, 2368, 1, 128] - - [729, 4422.75] + - [803, 4422.75] - - [6784, 2368, 1, 3328] - - [742, 8769.4] + - [816, 8769.4] - - [1408, 6784, 1, 128] - - [729, 4739.0] + - [803, 4739.0] - - [256, 1024, 1, 1280] - - [751, 5722.21] + - [825, 5722.21] - - [704, 4, 1, 128] - - [777, 8.66578] + - [851, 8.66578] - - [1408, 4, 1, 128] - - [777, 26.1439] + - [851, 26.1439] - - [4288, 128, 1, 256] - - [751, 4865.38] + - [825, 4865.38] - - [4288, 1856, 1, 3328] - - [738, 9250.04] + - [812, 9250.04] - - [3584, 448, 1, 128] - - [729, 3029.59] + - [803, 3029.59] - - [64, 4288, 1, 128] - - [646, 1535.38] + - [720, 1535.38] - - [64, 448, 1, 3328] - - [716, 3457.36] + - [790, 3457.36] - - [448, 4, 1, 3328] - - [694, 367.328] + - [768, 367.328] - - [256, 4, 1, 3328] - - [785, 320.389] + - [859, 320.389] - - [4, 1408, 1, 1280] - - [782, 344.039] + - [856, 344.039] - - [3584, 64, 1, 1280] - - [670, 5191.07] + - [744, 5191.07] - - [1408, 448, 1, 128] - - [730, 2218.24] + - [804, 2218.24] - - [3584, 1024, 1, 1280] - - [752, 8253.11] + - [826, 8253.11] - - [1856, 5056, 1, 256] - - [770, 7552.55] + - [844, 7552.55] - - [4, 3584, 1, 256] - - [689, 325.456] + - [763, 325.456] - - [6784, 4288, 1, 3328] - - [745, 8655.34] + - [819, 8655.34] - - [4, 2944, 1, 1280] - - [689, 547.821] + - [763, 547.821] - - [1024, 4288, 1, 256] - - [746, 7788.83] + - [820, 7788.83] - - [5888, 3584, 1, 3328] - - [749, 9173.39] + - [823, 9173.39] - - [1856, 4, 1, 256] - - [780, 282.919] + - [854, 282.919] - - [4, 256, 1, 256] - - [689, 49.7485] + - [763, 49.7485] - - [5056, 3584, 1, 3328] - - [755, 8457.53] + - [829, 8457.53] - - [1408, 128, 1, 3328] - - [709, 5714.52] + - [783, 5714.52] - - [4, 64, 1, 1280] - - [785, 42.7667] + - [859, 42.7667] - - [2368, 1408, 1, 1280] - - [746, 8224.92] + - [820, 8224.92] - - [5056, 2944, 1, 1280] - - [738, 9295.13] + - [812, 9295.13] - - [8448, 6000, 1, 2816] - - [742, 8037.97] + - [816, 8037.97] - - [4, 4, 1, 128] - - [777, 0.1433898] + - [851, 0.1433898] - - [3584, 256, 1, 256] - - [741, 6116.79] + - [815, 6116.79] - - [3584, 2944, 1, 1280] - - [738, 8796.49] + - [812, 8796.49] - - [1024, 6784, 1, 256] - - [745, 8187.86] + - [819, 8187.86] - - [4, 128, 1, 256] - - [689, 30.4407] + - [763, 30.4407] - - [6784, 448, 1, 256] - - [741, 7862.3] + - [815, 7862.3] - - [5124, 9124, 1, 2048] - - [743, 8176.41] + - [817, 8176.41] - - [2944, 5056, 1, 3328] - - [738, 9328.34] + - [812, 9328.34] - - [6784, 4, 1, 128] - - [776, 204.9] + - [850, 204.9] - - [2944, 1408, 1, 128] - - [727, 3838.2] + - [801, 3838.2] - - [448, 128, 1, 3328] - - [692, 4632.16] + - [766, 4632.16] - - [64, 2944, 1, 3328] - - [709, 5663.47] + - [783, 5663.47] - - [5056, 6784, 1, 3328] - - [745, 8420.17] + - [819, 8420.17] - - [704, 2368, 1, 128] - - [729, 3321.79] + - [803, 3321.79] - - [3072, 1500, 1, 1024] - - [746, 8221.77] + - [820, 8221.77] - - [128, 2944, 1, 256] - - [741, 4550.52] + - [815, 4550.52] - - [128, 6784, 1, 128] - - [646, 2767.76] + - [720, 2767.76] - - [3584, 4288, 1, 256] - - [745, 8808.64] + - [819, 8808.64] - - [448, 1856, 1, 256] - - [750, 5166.63] + - [824, 5166.63] - - [1856, 6784, 1, 3328] - - [742, 8339.76] + - [816, 8339.76] - - [3584, 128, 1, 3328] - - [751, 6791.57] + - [825, 6791.57] - - [64, 1856, 1, 256] - - [673, 2210.03] + - [747, 2210.03] - - [64, 448, 1, 256] - - [705, 1008.35] + - [779, 1008.35] - - [5888, 4288, 1, 256] - - [745, 8869.63] + - [819, 8869.63] - - [128, 1500, 1, 1280] - - [702, 4733.54] + - [776, 4733.54] - - [5056, 1408, 1, 256] - - [743, 7523.31] + - [817, 7523.31] - - [35, 8457, 1, 4096] - - [636, 4023.17] + - [710, 4023.17] - - [64, 256, 1, 1280] - - [697, 1941.91] + - [771, 1941.91] - - [2944, 4, 1, 128] - - [776, 95.7426] + - [850, 95.7426] - - [3584, 1024, 1, 256] - - [768, 6553.68] + - [842, 6553.68] - - [512, 6000, 1, 1536] - - [742, 7357.25] + - [816, 7357.25] - - [256, 704, 1, 256] - - [741, 2912.81] + - [815, 2912.81] - - [5888, 5888, 1, 256] - - [752, 8802.7] + - [826, 8802.7] - - [4288, 1024, 1, 1280] - - [745, 8248.83] + - [819, 8248.83] - - [5888, 128, 1, 3328] - - [695, 6848.59] + - [769, 6848.59] - - [448, 6784, 1, 3328] - - [741, 8343.78] + - [815, 8343.78] - - [2944, 1408, 1, 1280] - - [738, 9229.48] + - [812, 9229.48] - - [3072, 6000, 1, 1024] - - [759, 9015.01] + - [833, 9015.01] - - [1024, 32, 1, 512] - - [680, 1498.07] + - [754, 1498.07] - - [2944, 1856, 1, 3328] - - [755, 7176.48] + - [829, 7176.48] - - [2368, 64, 1, 128] - - [646, 1206.48] + - [720, 1206.48] - - [256, 1024, 1, 128] - - [723, 1178.28] + - [797, 1178.28] - - [3584, 5888, 1, 1280] - - [745, 9023.58] + - [819, 9023.58] - - [64, 4, 1, 128] - - [777, 1.089372] + - [851, 1.089372] - - [6784, 1856, 1, 1280] - - [739, 8964.51] + - [813, 8964.51] - - [2944, 5056, 1, 256] - - [745, 8860.12] + - [819, 8860.12] - - [5888, 256, 1, 3328] - - [756, 8308.66] + - [830, 8308.66] - - [2944, 4288, 1, 128] - - [724, 4507.61] + - [798, 4507.61] - - [3584, 1408, 1, 256] - - [739, 8234.71] + - [813, 8234.71] - - [704, 3584, 1, 3328] - - [751, 7377.26] + - [825, 7377.26] - - [5056, 448, 1, 1280] - - [740, 7145.47] + - [814, 7145.47] - - [3584, 1856, 1, 3328] - - [756, 8954.81] + - [830, 8954.81] - - [64, 1408, 1, 128] - - [653, 731.974] + - [727, 731.974] - - [4288, 6784, 1, 1280] - - [745, 9166.55] + - [819, 9166.55] - - [1024, 3000, 1, 2048] - - [756, 7723.83] + - [830, 7723.83] - - [1408, 704, 1, 1280] - - [746, 7863.1] + - [820, 7863.1] - - [2944, 1024, 1, 256] - - [739, 5035.02] + - [813, 5035.02] - - [256, 64, 1, 128] - - [645, 150.757] + - [719, 150.757] - - [2368, 4288, 1, 3328] - - [743, 8568.84] + - [817, 8568.84] - - [4, 1408, 1, 256] - - [689, 219.885] + - [763, 219.885] - - [1024, 1408, 1, 1280] - - [771, 6761.13] + - [845, 6761.13] - - [64, 64, 1, 256] - - [671, 198.694] + - [745, 198.694] - - [704, 256, 1, 3328] - - [741, 4291.62] + - [815, 4291.62] - - [6784, 5056, 1, 256] - - [740, 8545.02] + - [814, 8545.02] - - [1856, 1856, 1, 128] - - [728, 4034.93] + - [802, 4034.93] - - [4288, 5888, 1, 256] - - [759, 8998.05] + - [833, 8998.05] - - [4, 704, 1, 3328] - - [694, 452.4] + - [768, 452.4] - - [35, 8457, 1, 2048] - - [637, 3375.37] + - [711, 3375.37] - - [448, 2944, 1, 256] - - [741, 6346.74] + - [815, 6346.74] - - [4, 4288, 1, 3328] - - [694, 630.978] + - [768, 630.978] - - [2944, 6784, 1, 256] - - [768, 8002.92] + - [842, 8002.92] - - [2944, 2944, 1, 128] - - [723, 4661.41] + - [797, 4661.41] - - [4, 4, 1, 1280] - - [694, 3.14762] + - [768, 3.14762] - - [1856, 3584, 1, 1280] - - [738, 8677.66] + - [812, 8677.66] - - [64, 2944, 1, 256] - - [741, 2926.95] + - [815, 2926.95] - - [3584, 1408, 1, 1280] - - [752, 8238.9] + - [826, 8238.9] - - [448, 256, 1, 128] - - [653, 1042.72] + - [727, 1042.72] - - [4288, 448, 1, 128] - - [729, 3698.82] + - [803, 3698.82] - - [5056, 256, 1, 1280] - - [746, 7058.5] + - [820, 7058.5] - - [1856, 1408, 1, 3328] - - [743, 8348.35] + - [817, 8348.35] - - [128, 128, 1, 128] - - [653, 145.736] + - [727, 145.736] - - [1024, 4288, 1, 3328] - - [739, 8042.61] + - [813, 8042.61] - - [448, 2368, 1, 256] - - [751, 5935.0] + - [825, 5935.0] - - [1024, 4, 1, 128] - - [777, 15.93] + - [851, 15.93] - - [64, 1408, 1, 1280] - - [675, 3865.49] + - [749, 3865.49] - - [64, 6784, 1, 1280] - - [771, 5629.61] + - [845, 5629.61] - - [5056, 448, 1, 256] - - [741, 7637.91] + - [815, 7637.91] - - [2944, 2368, 1, 3328] - - [749, 9112.44] + - [823, 9112.44] - - [704, 4288, 1, 3328] - - [741, 7950.2] + - [815, 7950.2] - - [1408, 128, 1, 256] - - [741, 2898.17] + - [815, 2898.17] - - [1024, 1856, 1, 1280] - - [739, 8087.51] + - [813, 8087.51] - - [6784, 1856, 1, 256] - - [770, 7538.25] + - [844, 7538.25] - - [512, 48000, 1, 2816] - - [738, 9704.21] + - [812, 9704.21] - - [512, 3000, 1, 2816] - - [740, 7621.63] + - [814, 7621.63] - - [128, 2368, 1, 3328] - - [703, 6038.94] + - [777, 6038.94] - - [1024, 5888, 1, 256] - - [755, 8185.82] + - [829, 8185.82] - - [64, 2944, 1, 1280] - - [702, 4540.24] + - [776, 4540.24] - - [6784, 1408, 1, 256] - - [755, 8574.0] + - [829, 8574.0] - - [5056, 64, 1, 3328] - - [703, 6310.97] + - [777, 6310.97] - - [128, 704, 1, 128] - - [642, 696.618] + - [716, 696.618] - - [1408, 2368, 1, 256] - - [741, 4995.06] + - [815, 4995.06] - - [1408, 1408, 1, 256] - - [738, 7552.34] + - [812, 7552.34] - - [4, 64, 1, 128] - - [776, 1.90441] + - [850, 1.90441] - - [64, 128, 1, 1280] - - [715, 1272.64] + - [789, 1272.64] - - [1024, 8, 1, 500000] - - [623, 2013.23] + - [697, 2013.23] - - [4, 2368, 1, 128] - - [777, 49.9526] + - [851, 49.9526] - - [2368, 2368, 1, 128] - - [728, 4483.8] + - [802, 4483.8] - - [64, 5888, 1, 128] - - [645, 1957.67] + - [719, 1957.67] - - [5888, 4, 1, 3328] - - [778, 638.798] + - [852, 638.798] - - [6784, 1408, 1, 128] - - [723, 4715.61] + - [797, 4715.61] - - [1408, 5056, 1, 256] - - [755, 8557.67] + - [829, 8557.67] - - [512, 50176, 1, 128] - - [786, 8809.39] + - [860, 8809.39] - - [5056, 128, 1, 3328] - - [678, 6810.66] + - [752, 6810.66] - - [128, 128, 1, 1280] - - [712, 1899.69] + - [786, 1899.69] - - [512, 2, 1, 512] - - [632, 87.4813] + - [706, 87.4813] - - [448, 704, 1, 256] - - [751, 3765.97] + - [825, 3765.97] - - [4288, 3584, 1, 128] - - [736, 4563.77] + - [810, 4563.77] - - [2944, 128, 1, 3328] - - [678, 6507.45] + - [752, 6507.45] - - [128, 5056, 1, 1280] - - [741, 6557.85] + - [815, 6557.85] - - [3584, 5056, 1, 1280] - - [738, 9407.93] + - [812, 9407.93] - - [256, 448, 1, 1280] - - [702, 4096.1] + - [776, 4096.1] - - [704, 704, 1, 128] - - [728, 2374.31] + - [802, 2374.31] - - [5056, 4, 1, 128] - - [776, 125.52] + - [850, 125.52] - - [704, 256, 1, 1280] - - [751, 4016.23] + - [825, 4016.23] - - [64, 2368, 1, 3328] - - [708, 5159.29] + - [782, 5159.29] - - [1856, 1024, 1, 128] - - [728, 3356.47] + - [802, 3356.47] - - [1856, 64, 1, 128] - - [645, 945.644] + - [719, 945.644] - - [4096, 64, 1, 4096] - - [711, 6260.24] + - [785, 6260.24] - - [1024, 24000, 1, 1536] - - [755, 9368.5] + - [829, 9368.5] - - [704, 4288, 1, 256] - - [752, 7329.39] + - [826, 7329.39] - - [5888, 2368, 1, 1280] - - [741, 8624.71] + - [815, 8624.71] - - [6784, 1856, 1, 3328] - - [745, 9012.45] + - [819, 9012.45] - - [64, 128, 1, 256] - - [671, 374.591] + - [745, 374.591] - - [2368, 5888, 1, 1280] - - [739, 9045.76] + - [813, 9045.76] - - [5888, 256, 1, 1280] - - [756, 7999.17] + - [830, 7999.17] - - [4, 5888, 1, 1280] - - [689, 615.839] + - [763, 615.839] - - [704, 128, 1, 128] - - [645, 693.269] + - [719, 693.269] - - [1024, 4, 1, 1280] - - [784, 372.464] + - [858, 372.464] - - [2368, 1856, 1, 3328] - - [756, 8246.91] + - [830, 8246.91] - - [2368, 128, 1, 128] - - [646, 1963.53] + - [720, 1963.53] - - [2944, 704, 1, 256] - - [756, 7116.24] + - [830, 7116.24] - - [5056, 128, 1, 128] - - [649, 2519.49] + - [723, 2519.49] - - [2368, 1024, 1, 3328] - - [741, 7959.13] + - [815, 7959.13] - - [35, 700, 1, 2048] - - [637, 1766.86] + - [711, 1766.86] - - [256, 704, 1, 3328] - - [741, 4296.56] + - [815, 4296.56] - - [704, 3584, 1, 256] - - [740, 7441.61] + - [814, 7441.61] - - [704, 2944, 1, 3328] - - [757, 7195.81] + - [831, 7195.81] - - [6784, 1024, 1, 128] - - [728, 4509.18] + - [802, 4509.18] - - [256, 448, 1, 128] - - [653, 838.003] + - [727, 838.003] - - [448, 1024, 1, 3328] - - [751, 6515.65] + - [825, 6515.65] - - [2944, 1024, 1, 3328] - - [746, 8751.63] + - [820, 8751.63] - - [2944, 5056, 1, 128] - - [723, 4799.73] + - [797, 4799.73] - - [2368, 256, 1, 256] - - [740, 4754.67] + - [814, 4754.67] - - [1408, 6784, 1, 256] - - [768, 7477.09] + - [842, 7477.09] - - [6784, 1408, 1, 3328] - - [746, 8968.57] + - [820, 8968.57] - - [4288, 6784, 1, 128] - - [721, 4455.74] + - [795, 4455.74] - - [1408, 2944, 1, 128] - - [733, 3862.79] + - [807, 3862.79] - - [704, 64, 1, 256] - - [672, 1441.89] + - [746, 1441.89] - - [3072, 4, 1, 1024] - - [690, 711.803] + - [764, 711.803] - - [256, 2368, 1, 3328] - - [765, 5199.73] + - [839, 5199.73] - - [6784, 2944, 1, 1280] - - [749, 8914.45] + - [823, 8914.45] - - [4288, 1856, 1, 128] - - [729, 4683.3] + - [803, 4683.3] - - [1856, 2944, 1, 128] - - [723, 4589.34] + - [797, 4589.34] - - [6784, 448, 1, 128] - - [723, 3918.53] + - [797, 3918.53] - - [64, 3584, 1, 128] - - [654, 1468.11] + - [728, 1468.11] - - [448, 5056, 1, 1280] - - [746, 7561.4] + - [820, 7561.4] - - [4288, 5056, 1, 1280] - - [738, 9304.11] + - [812, 9304.11] - - [2368, 1856, 1, 128] - - [728, 4322.17] + - [802, 4322.17] - - [128, 448, 1, 1280] - - [708, 3336.48] + - [782, 3336.48] - - [4288, 704, 1, 256] - - [751, 7834.65] + - [825, 7834.65] - - [256, 3584, 1, 128] - - [724, 2500.96] + - [798, 2500.96] - - [5888, 704, 1, 256] - - [770, 7244.49] + - [844, 7244.49] - - [3584, 1024, 1, 128] - - [735, 3169.03] + - [809, 3169.03] - - [256, 5888, 1, 3328] - - [756, 7763.47] + - [830, 7763.47] - - [1408, 4288, 1, 3328] - - [738, 9273.8] + - [812, 9273.8] - - [6784, 4288, 1, 256] - - [746, 8825.2] + - [820, 8825.2] - - [4288, 256, 1, 128] - - [725, 2621.54] + - [799, 2621.54] - - [448, 1856, 1, 3328] - - [766, 5859.8] + - [840, 5859.8] - - [5888, 256, 1, 256] - - [756, 7124.84] + - [830, 7124.84] - - [1024, 4, 1, 500000] - - [621, 1030.2] + - [695, 1030.2] - - [6784, 1024, 1, 1280] - - [738, 9083.11] + - [812, 9083.11] - - [5888, 1024, 1, 128] - - [725, 4297.16] + - [799, 4297.16] - - [1024, 128, 1, 256] - - [741, 2086.82] + - [815, 2086.82] - - [512, 16, 1, 500000] - - [622, 3921.96] + - [696, 3921.96] - - [128, 64, 1, 3328] - - [712, 1969.97] + - [786, 1969.97] - - [448, 64, 1, 256] - - [697, 1092.37] + - [771, 1092.37] - - [2368, 256, 1, 128] - - [728, 2174.84] + - [802, 2174.84] - - [6784, 3584, 1, 1280] - - [738, 9558.82] + - [812, 9558.82] - - [1024, 6784, 1, 1280] - - [747, 8637.72] + - [821, 8637.72] - - [2944, 64, 1, 1280] - - [669, 4770.13] + - [743, 4770.13] - - [1408, 2944, 1, 1280] - - [738, 9238.47] + - [812, 9238.47] - - [256, 1856, 1, 256] - - [764, 4498.43] + - [838, 4498.43] - - [1408, 2368, 1, 3328] - - [746, 8344.97] + - [820, 8344.97] - - [2944, 4, 1, 3328] - - [781, 661.209] + - [855, 661.209] - - [128, 1408, 1, 3328] - - [709, 5641.42] + - [783, 5641.42] - - [2944, 1856, 1, 128] - - [723, 4488.04] + - [797, 4488.04] - - [256, 2944, 1, 128] - - [733, 2233.18] + - [807, 2233.18] - - [256, 6784, 1, 128] - - [722, 3139.9] + - [796, 3139.9] - - [2368, 4, 1, 128] - - [777, 38.7612] + - [851, 38.7612] - - [1408, 256, 1, 3328] - - [773, 4927.67] + - [847, 4927.67] - - [1856, 4, 1, 128] - - [777, 42.3719] + - [851, 42.3719] - - [1024, 16, 1, 512] - - [689, 1115.61] + - [763, 1115.61] - - [5056, 6784, 1, 128] - - [724, 4963.45] + - [798, 4963.45] - - [4288, 5056, 1, 128] - - [722, 4928.09] + - [796, 4928.09] - - [1856, 5888, 1, 128] - - [729, 4865.15] + - [803, 4865.15] - - [7680, 2, 1, 2560] - - [665, 499.612] + - [739, 499.612] - - [3584, 1856, 1, 256] - - [755, 7978.38] + - [829, 7978.38] - - [4288, 3584, 1, 1280] - - [755, 7852.26] + - [829, 7852.26] - - [2368, 448, 1, 256] - - [770, 5238.93] + - [844, 5238.93] - - [4288, 256, 1, 3328] - - [741, 6751.34] + - [815, 6751.34] - - [1856, 704, 1, 128] - - [723, 3525.56] + - [797, 3525.56] - - [1408, 64, 1, 256] - - [682, 1884.8] + - [756, 1884.8] - - [64, 1856, 1, 128] - - [659, 888.205] + - [733, 888.205] - - [4, 256, 1, 128] - - [776, 7.38178] + - [850, 7.38178] - - [512, 16, 1, 512] - - [689, 663.756] + - [763, 663.756] - - [704, 5888, 1, 128] - - [723, 4424.55] + - [797, 4424.55] - - [6784, 3584, 1, 128] - - [725, 3823.4] + - [799, 3823.4] - - [1024, 64, 1, 256] - - [667, 1379.81] + - [741, 1379.81] - - [64, 2368, 1, 256] - - [741, 2424.93] + - [815, 2424.93] - - [5124, 1500, 1, 2048] - - [759, 8391.84] + - [833, 8391.84] - - [4288, 5056, 1, 3328] - - [745, 9274.14] + - [819, 9274.14] - - [4, 1856, 1, 1280] - - [689, 453.474] + - [763, 453.474] - - [4288, 128, 1, 128] - - [723, 2157.8] + - [797, 2157.8] - - [512, 2, 1, 500000] - - [633, 516.895] + - [707, 516.895] - - [1408, 1408, 1, 128] - - [724, 3600.49] + - [798, 3600.49] - - [7680, 16, 1, 2560] - - [704, 3542.59] + - [778, 3542.59] - - [1856, 128, 1, 128] - - [656, 1532.8] + - [730, 1532.8] - - [5056, 2368, 1, 256] - - [768, 7684.07] + - [842, 7684.07] - - [4288, 704, 1, 3328] - - [741, 7642.96] + - [815, 7642.96] - - [448, 3584, 1, 256] - - [751, 6734.07] + - [825, 6734.07] - - [2368, 64, 1, 1280] - - [702, 3962.24] + - [776, 3962.24] - - [2368, 1024, 1, 1280] - - [753, 7989.64] + - [827, 7989.64] - - [2944, 1408, 1, 3328] - - [756, 8954.66] + - [830, 8954.66] - - [6144, 1500, 1, 2560] - - [774, 8170.07] + - [848, 8170.07] - - [4224, 1, 1, 128] - - [705, 76.9] + - [779, 76.9] - - [1024, 1408, 1, 3328] - - [771, 6961.38] + - [845, 6961.38] - - [2944, 5888, 1, 1280] - - [752, 8797.53] + - [826, 8797.53] - - [8448, 2, 1, 2816] - - [627, 496.958] + - [701, 496.958] - - [1408, 4, 1, 1280] - - [782, 471.891] + - [856, 471.891] - - [5888, 3584, 1, 256] - - [759, 8246.3] + - [833, 8246.3] - - [2368, 5056, 1, 128] - - [722, 4906.9] + - [796, 4906.9] - - [1408, 1856, 1, 3328] - - [746, 9006.8] + - [820, 9006.8] - - [4, 4, 1, 3328] - - [694, 5.83793] + - [768, 5.83793] - - [5888, 5056, 1, 3328] - - [759, 8545.1] + - [833, 8545.1] - - [7680, 6000, 1, 2560] - - [752, 7996.0] + - [826, 7996.0] - - [6784, 1408, 1, 1280] - - [746, 8888.13] + - [820, 8888.13] - - [4, 1024, 1, 1280] - - [694, 302.109] + - [768, 302.109] - - [512, 3000, 1, 2560] - - [746, 7809.43] + - [820, 7809.43] - - [704, 2944, 1, 256] - - [751, 4909.24] + - [825, 4909.24] - - [4288, 64, 1, 256] - - [751, 3264.72] + - [825, 3264.72] - - [6784, 5888, 1, 3328] - - [759, 9544.52] + - [833, 9544.52] - - [2368, 4288, 1, 128] - - [722, 4873.03] + - [796, 4873.03] - - [64, 4288, 1, 1280] - - [708, 4656.42] + - [782, 4656.42] - - [6784, 64, 1, 1280] - - [741, 6230.43] + - [815, 6230.43] - - [3584, 128, 1, 128] - - [649, 2315.57] + - [723, 2315.57] - - [1024, 6784, 1, 128] - - [723, 3758.94] + - [797, 3758.94] - - [1024, 1500, 1, 1536] - - [772, 6972.0] + - [846, 6972.0] - - [1408, 64, 1, 3328] - - [675, 5079.58] + - [749, 5079.58] - - [6784, 4, 1, 256] - - [661, 487.938] + - [735, 487.938] - - [1408, 1408, 1, 1280] - - [774, 7423.31] + - [848, 7423.31] - - [256, 2368, 1, 256] - - [741, 4986.9] + - [815, 4986.9] - - [3072, 3000, 1, 1024] - - [743, 7844.01] + - [817, 7844.01] - - [448, 4288, 1, 3328] - - [742, 7204.79] + - [816, 7204.79] - - [2368, 1408, 1, 256] - - [774, 5897.96] + - [848, 5897.96] - - [704, 2368, 1, 256] - - [741, 7000.93] + - [815, 7000.93] - - [1024, 24000, 1, 2560] - - [768, 8562.31] + - [842, 8562.31] - - [2944, 448, 1, 1280] - - [756, 7155.93] + - [830, 7155.93] - - [5888, 2368, 1, 3328] - - [755, 9252.42] + - [829, 9252.42] - - [1024, 256, 1, 128] - - [737, 1255.88] + - [811, 1255.88] - - [5124, 9124, 1, 1760] - - [749, 9168.49] + - [823, 9168.49] - - [448, 1408, 1, 1280] - - [741, 6150.34] + - [815, 6150.34] - - [448, 1856, 1, 1280] - - [756, 6489.76] + - [830, 6489.76] - - [4288, 448, 1, 1280] - - [771, 6887.02] + - [845, 6887.02] - - [5888, 704, 1, 3328] - - [751, 8230.64] + - [825, 8230.64] - - [4, 1856, 1, 128] - - [777, 27.0964] + - [851, 27.0964] - - [5056, 256, 1, 128] - - [722, 3469.01] + - [796, 3469.01] - - [1856, 256, 1, 128] - - [723, 2534.16] + - [797, 2534.16] - - [128, 2368, 1, 256] - - [741, 3660.22] + - [815, 3660.22] - - [704, 4, 1, 256] - - [689, 134.596] + - [763, 134.596] - - [1024, 6784, 1, 3328] - - [743, 8482.75] + - [817, 8482.75] - - [1408, 5888, 1, 128] - - [723, 4644.52] + - [797, 4644.52] - - [4288, 4, 1, 128] - - [776, 35.8799] + - [850, 35.8799] - - [512, 3136, 1, 2048] - - [788, 6386.69] + - [862, 6386.69] - - [1408, 1024, 1, 256] - - [741, 5440.82] + - [815, 5440.82] - - [128, 64, 1, 256] - - [671, 380.019] + - [745, 380.019] - - [8448, 1500, 1, 2816] - - [738, 9155.92] + - [812, 9155.92] - - [256, 704, 1, 128] - - [723, 895.623] + - [797, 895.623] - - [2560, 7000, 1, 2560] - - [750, 8565.66] + - [824, 8565.66] - - [5888, 64, 1, 1280] - - [765, 5007.83] + - [839, 5007.83] - - [128, 4, 1, 3328] - - [784, 165.21] + - [858, 165.21] - - [5056, 6784, 1, 1280] - - [749, 9331.48] + - [823, 9331.48] - - [1024, 448, 1, 1280] - - [751, 6501.46] + - [825, 6501.46] - - [704, 5056, 1, 3328] - - [738, 8090.13] + - [812, 8090.13] - - [128, 5056, 1, 256] - - [751, 5537.37] + - [825, 5537.37] - - [3584, 5056, 1, 3328] - - [747, 8633.24] + - [821, 8633.24] - - [1856, 4, 1, 3328] - - [785, 582.814] + - [859, 582.814] - - [4, 2944, 1, 128] - - [776, 114.292] + - [850, 114.292] - - [2368, 2944, 1, 3328] - - [755, 8749.55] + - [829, 8749.55] - - [448, 448, 1, 1280] - - [679, 4694.93] + - [753, 4694.93] - - [128, 4, 1, 128] - - [776, 4.94734] + - [850, 4.94734] - - [2368, 3584, 1, 256] - - [755, 8418.59] + - [829, 8418.59] - - [4608, 3000, 1, 1536] - - [745, 9076.47] + - [819, 9076.47] - - [1024, 256, 1, 1280] - - [751, 5562.84] + - [825, 5562.84] - - [5056, 3584, 1, 1280] - - [745, 8365.09] + - [819, 8365.09] - - [5124, 9124, 1, 4096] - - [755, 8648.58] + - [829, 8648.58] - - [7680, 48000, 1, 2560] - - [749, 4098.26] + - [823, 4098.26] - - [1856, 704, 1, 1280] - - [741, 8141.04] + - [815, 8141.04] - - [1856, 2944, 1, 1280] - - [743, 8214.4] + - [817, 8214.4] - - [4608, 1500, 1, 1536] - - [751, 8424.53] + - [825, 8424.53] - - [1024, 48000, 1, 2816] - - [742, 8513.18] + - [816, 8513.18] - - [5124, 9124, 1, 2560] - - [759, 8641.24] + - [833, 8641.24] - - [128, 1024, 1, 256] - - [673, 2356.45] + - [747, 2356.45] - - [2944, 1408, 1, 256] - - [755, 8254.29] + - [829, 8254.29] - - [4288, 1408, 1, 3328] - - [749, 9138.49] + - [823, 9138.49] - - [3584, 64, 1, 3328] - - [662, 5629.62] + - [736, 5629.62] - - [5888, 2944, 1, 128] - - [723, 4119.33] + - [797, 4119.33] - - [2944, 1024, 1, 128] - - [725, 4002.96] + - [799, 4002.96] - - [128, 1, 1, 1024] - - [719, 20.0805] + - [793, 20.0805] - - [5124, 700, 1, 2048] - - [756, 7653.84] + - [830, 7653.84] - - [4, 4288, 1, 1280] - - [689, 587.749] + - [763, 587.749] - - [6784, 5056, 1, 128] - - [728, 4855.85] + - [802, 4855.85] - - [256, 1024, 1, 3328] - - [751, 6116.28] + - [825, 6116.28] - - [3584, 4, 1, 256] - - [663, 395.576] + - [737, 395.576] - - [1856, 64, 1, 3328] - - [678, 5732.6] + - [752, 5732.6] - - [4, 128, 1, 3328] - - [784, 162.689] + - [858, 162.689] - - [256, 12544, 1, 1024] - - [788, 7628.92] + - [862, 7628.92] - - [5888, 1408, 1, 3328] - - [749, 9524.43] + - [823, 9524.43] - - [448, 2944, 1, 128] - - [723, 3163.91] + - [797, 3163.91] - - [2368, 1856, 1, 256] - - [751, 8167.36] + - [825, 8167.36] - - [256, 5056, 1, 256] - - [741, 7292.13] + - [815, 7292.13] - - [5056, 5056, 1, 128] - - [729, 5043.99] + - [803, 5043.99] - - [448, 3584, 1, 3328] - - [746, 6839.56] + - [820, 6839.56] - - [4, 5056, 1, 3328] - - [694, 639.886] + - [768, 639.886] - - [256, 256, 1, 128] - - [653, 554.902] + - [727, 554.902] - - [5888, 256, 1, 128] - - [725, 3562.47] + - [799, 3562.47] - - [4, 5056, 1, 128] - - [776, 149.907] + - [850, 149.907] - - [448, 256, 1, 256] - - [672, 2121.5] + - [746, 2121.5] - - [704, 4, 1, 3328] - - [782, 455.919] + - [856, 455.919] - - [1408, 256, 1, 256] - - [741, 4352.68] + - [815, 4352.68] - - [3584, 1856, 1, 128] - - [732, 3933.23] + - [806, 3933.23] - - [4288, 4288, 1, 128] - - [723, 4888.61] + - [797, 4888.61] - - [1856, 1024, 1, 3328] - - [759, 8242.64] + - [833, 8242.64] - - [1856, 4288, 1, 128] - - [728, 4647.4] + - [802, 4647.4] - - [1024, 6000, 1, 2560] - - [753, 8526.75] + - [827, 8526.75] - - [1024, 5056, 1, 256] - - [738, 7343.83] + - [812, 7343.83] - - [5056, 5888, 1, 128] - - [727, 4053.5] + - [801, 4053.5] - - [2368, 1408, 1, 3328] - - [741, 8466.2] + - [815, 8466.2] - - [1024, 48000, 1, 1536] - - [759, 9487.74] + - [833, 9487.74] - - [5888, 448, 1, 256] - - [772, 6081.54] + - [846, 6081.54] - - [5888, 6784, 1, 128] - - [724, 4820.27] + - [798, 4820.27] - - [2368, 4, 1, 3328] - - [783, 620.628] + - [857, 620.628] - - [6784, 5056, 1, 1280] - - [768, 8525.5] + - [842, 8525.5] - - [5056, 704, 1, 1280] - - [738, 7933.06] + - [812, 7933.06] - - [1024, 48000, 1, 2560] - - [759, 8877.94] + - [833, 8877.94] - - [4608, 32, 1, 1536] - - [688, 3556.83] + - [762, 3556.83] - - [1024, 2368, 1, 128] - - [731, 2943.75] + - [805, 2943.75] - - [128, 704, 1, 256] - - [672, 2059.8] + - [746, 2059.8] - - [2368, 448, 1, 3328] - - [751, 5290.42] + - [825, 5290.42] - - [128, 5888, 1, 3328] - - [751, 7764.43] + - [825, 7764.43] - - [448, 128, 1, 1280] - - [702, 3373.28] + - [776, 3373.28] - - [6784, 4, 1, 3328] - - [661, 676.063] + - [735, 676.063] - - [4288, 4, 1, 1280] - - [694, 564.775] + - [768, 564.775] - - [1024, 64, 1, 3328] - - [708, 4293.48] + - [782, 4293.48] - - [3072, 48000, 1, 1024] - - [758, 7826.51] + - [832, 7826.51] - - [256, 4, 1, 128] - - [777, 4.93304] + - [851, 4.93304] - - [1024, 5888, 1, 128] - - [736, 3610.46] + - [810, 3610.46] - - [3584, 5888, 1, 128] - - [724, 4722.35] + - [798, 4722.35] - - [5056, 5888, 1, 256] - - [759, 9159.11] + - [833, 9159.11] - - [2368, 1024, 1, 256] - - [751, 7482.71] + - [825, 7482.71] - - [2944, 1856, 1, 256] - - [755, 8209.0] + - [829, 8209.0] - - [1856, 6784, 1, 1280] - - [751, 8205.43] + - [825, 8205.43] - - [64, 5056, 1, 128] - - [646, 2079.35] + - [720, 2079.35] - - [64, 6784, 1, 128] - - [646, 2437.58] + - [720, 2437.58] - - [448, 704, 1, 128] - - [722, 1506.45] + - [796, 1506.45] - - [4, 1024, 1, 128] - - [777, 17.3463] + - [851, 17.3463] - - [1408, 448, 1, 256] - - [741, 5545.45] + - [815, 5545.45] - - [1408, 704, 1, 128] - - [727, 2931.65] + - [801, 2931.65] - - [64, 256, 1, 3328] - - [713, 2816.52] + - [787, 2816.52] - - [8448, 3000, 1, 2816] - - [747, 8872.99] + - [821, 8872.99] - - [6784, 448, 1, 3328] - - [741, 7555.48] + - [815, 7555.48] - - [5056, 1856, 1, 1280] - - [739, 8652.36] + - [813, 8652.36] - - [1408, 1024, 1, 3328] - - [743, 7781.42] + - [817, 7781.42] - - [2368, 256, 1, 3328] - - [747, 5392.06] + - [821, 5392.06] - - [7680, 1500, 1, 2560] - - [745, 8919.72] + - [819, 8919.72] - - [5888, 3584, 1, 1280] - - [745, 9235.85] + - [819, 9235.85] - - [1856, 3584, 1, 3328] - - [756, 8348.83] + - [830, 8348.83] - - [5888, 128, 1, 1280] - - [741, 5928.61] + - [815, 5928.61] - - [1024, 2944, 1, 256] - - [772, 6630.27] + - [846, 6630.27] - - [448, 6784, 1, 1280] - - [753, 8332.45] + - [827, 8332.45] - - [256, 3584, 1, 1280] - - [743, 7140.19] + - [817, 7140.19] - - [448, 128, 1, 128] - - [645, 552.813] + - [719, 552.813] - - [704, 5056, 1, 256] - - [751, 7959.68] + - [825, 7959.68] - - [3584, 1024, 1, 3328] - - [743, 8386.84] + - [817, 8386.84] - - [2944, 1856, 1, 1280] - - [759, 7670.29] + - [833, 7670.29] - - [128, 256, 1, 128] - - [660, 258.37] + - [734, 258.37] - - [5056, 256, 1, 256] - - [751, 5736.77] + - [825, 5736.77] - - [2944, 4288, 1, 3328] - - [738, 8730.8] + - [812, 8730.8] - - [2368, 3584, 1, 3328] - - [740, 8437.71] + - [814, 8437.71] - - [2944, 704, 1, 1280] - - [751, 8342.53] + - [825, 8342.53] - - [128, 4, 1, 256] - - [671, 24.9242] + - [745, 24.9242] - - [2944, 3584, 1, 1280] - - [753, 8322.11] + - [827, 8322.11] - - [1856, 5888, 1, 1280] - - [738, 8911.91] + - [812, 8911.91] - - [256, 256, 1, 1280] - - [702, 3653.67] + - [776, 3653.67] - - [4608, 24000, 1, 1536] - - [752, 8931.06] + - [826, 8931.06] - - [4288, 1408, 1, 256] - - [739, 8338.45] + - [813, 8338.45] - - [3584, 64, 1, 256] - - [751, 3414.07] + - [825, 3414.07] - - [64, 1856, 1, 3328] - - [678, 5460.23] + - [752, 5460.23] - - [256, 1408, 1, 128] - - [722, 1424.09] + - [796, 1424.09] - - [5888, 1408, 1, 128] - - [733, 4177.88] + - [807, 4177.88] - - [4288, 2368, 1, 1280] - - [742, 8596.05] + - [816, 8596.05] - - [4, 4288, 1, 256] - - [778, 370.954] + - [852, 370.954] - - [256, 4288, 1, 128] - - [723, 2907.99] + - [797, 2907.99] - - [256, 128, 1, 3328] - - [716, 3644.88] + - [790, 3644.88] - - [512, 8, 1, 500000] - - [628, 2025.89] + - [702, 2025.89] - - [6784, 2368, 1, 256] - - [741, 8470.41] + - [815, 8470.41] - - [5888, 128, 1, 128] - - [646, 2604.55] + - [720, 2604.55] - - [1408, 448, 1, 3328] - - [751, 6540.62] + - [825, 6540.62] - - [1024, 24000, 1, 2816] - - [768, 8364.03] + - [842, 8364.03] - - [704, 1024, 1, 1280] - - [751, 7277.28] + - [825, 7277.28] - - [1856, 256, 1, 3328] - - [741, 7039.14] + - [815, 7039.14] - - [1856, 2944, 1, 256] - - [750, 8151.59] + - [824, 8151.59] - - [5056, 1024, 1, 128] - - [724, 4422.82] + - [798, 4422.82] - - [64, 5888, 1, 1280] - - [702, 4854.62] + - [776, 4854.62] - - [7680, 3000, 1, 2560] - - [755, 8789.57] + - [829, 8789.57] - - [4224, 1500, 1, 176] - - [751, 7902.14] + - [825, 7902.14] - - [5124, 700, 1, 2560] - - [741, 8232.59] + - [815, 8232.59] - - [6784, 256, 1, 128] - - [722, 3548.92] + - [796, 3548.92] - - [5888, 704, 1, 128] - - [729, 3959.65] + - [803, 3959.65] - - [6784, 64, 1, 128] - - [657, 2150.82] + - [731, 2150.82] - - [4, 448, 1, 1280] - - [782, 268.063] + - [856, 268.063] - - [1024, 4288, 1, 1280] - - [756, 8363.72] + - [830, 8363.72] - - [2368, 5056, 1, 3328] - - [755, 8581.85] + - [829, 8581.85] - - [448, 4, 1, 128] - - [776, 16.8673] + - [850, 16.8673] - - [4, 256, 1, 3328] - - [785, 201.988] + - [859, 201.988] - - [4288, 1024, 1, 3328] - - [751, 8567.72] + - [825, 8567.72] - - [6144, 48000, 1, 2560] - - [759, 3751.68] + - [833, 3751.68] - - [1024, 5056, 1, 3328] - - [738, 9440.66] + - [812, 9440.66] - - [1024, 1856, 1, 3328] - - [759, 8244.36] + - [833, 8244.36] - - [704, 704, 1, 1280] - - [751, 5529.99] + - [825, 5529.99] - - [128, 2368, 1, 1280] - - [708, 5062.38] + - [782, 5062.38] - - [3584, 4, 1, 128] - - [777, 61.5949] + - [851, 61.5949] - - [3584, 256, 1, 1280] - - [775, 6260.24] + - [849, 6260.24] - - [4, 128, 1, 128] - - [776, 1.2587] + - [850, 1.2587] - - [128, 4288, 1, 3328] - - [687, 6186.15] + - [761, 6186.15] - - [5124, 1500, 1, 2560] - - [755, 8432.62] + - [829, 8432.62] - - [3584, 128, 1, 1280] - - [741, 6547.85] + - [815, 6547.85] - - [4, 256, 1, 1280] - - [694, 180.144] + - [768, 180.144] - - [128, 704, 1, 3328] - - [666, 5177.81] + - [740, 5177.81] - - [4288, 6784, 1, 256] - - [739, 9005.34] + - [813, 9005.34] - - [3584, 2944, 1, 3328] - - [756, 8872.27] + - [830, 8872.27] - - [128, 1856, 1, 256] - - [741, 3690.48] + - [815, 3690.48] - - [64, 4288, 1, 256] - - [741, 3007.57] + - [815, 3007.57] - - [4, 3584, 1, 3328] - - [671, 639.99] + - [745, 639.99] - - [64, 4, 1, 3328] - - [785, 98.7074] + - [859, 98.7074] - - [4, 64, 1, 3328] - - [785, 91.9069] + - [859, 91.9069] - - [35, 700, 1, 2560] - - [639, 2397.65] + - [713, 2397.65] - - [5888, 2944, 1, 256] - - [749, 9031.28] + - [823, 9031.28] - - [4, 2368, 1, 256] - - [689, 256.968] + - [763, 256.968] - - [1856, 64, 1, 256] - - [673, 2222.96] + - [747, 2222.96] - - [5056, 128, 1, 1280] - - [741, 6557.85] + - [815, 6557.85] - - [448, 4288, 1, 1280] - - [765, 6891.66] + - [839, 6891.66] - - [256, 4288, 1, 256] - - [741, 6250.51] + - [815, 6250.51] - - [1024, 4288, 1, 128] - - [725, 3951.41] + - [799, 3951.41] - - [4, 1024, 1, 256] - - [689, 182.144] + - [763, 182.144] - - [5056, 4288, 1, 256] - - [745, 8933.43] + - [819, 8933.43] - - [1024, 448, 1, 256] - - [751, 4573.33] + - [825, 4573.33] - - [1024, 3584, 1, 256] - - [746, 7447.18] + - [820, 7447.18] - - [2944, 128, 1, 1280] - - [751, 5417.27] + - [825, 5417.27] - - [49, 2048, 64, 512] - - [794, 5916.91] + - [868, 5916.91] - - [2560, 32, 1, 2560] - - [688, 4076.99] + - [762, 4076.99] - - [64, 256, 1, 256] - - [705, 689.953] + - [779, 689.953] - - [1024, 4, 1, 512] - - [697, 288.17] + - [771, 288.17] - - [128, 2368, 1, 128] - - [651, 1809.68] + - [725, 1809.68] - - [256, 704, 1, 1280] - - [741, 4033.08] + - [815, 4033.08] - - [64, 2368, 1, 128] - - [642, 1165.88] + - [716, 1165.88] - - [176, 1500, 1, 1408] - - [669, 4922.13] + - [743, 4922.13] - - [448, 5888, 1, 1280] - - [751, 7550.21] + - [825, 7550.21] - - [512, 3000, 1, 2048] - - [773, 6562.44] + - [847, 6562.44] - - [5056, 448, 1, 128] - - [723, 3947.97] + - [797, 3947.97] - - [4288, 704, 1, 1280] - - [741, 8243.82] + - [815, 8243.82] - - [3584, 2944, 1, 128] - - [733, 4284.88] + - [807, 4284.88] - - [6784, 256, 1, 1280] - - [741, 7955.21] + - [815, 7955.21] - - [256, 2944, 1, 1280] - - [771, 6691.9] + - [845, 6691.9] - - [2560, 128, 1, 2560] - - [709, 5347.23] + - [783, 5347.23] - - [2368, 5888, 1, 3328] - - [746, 8919.07] + - [820, 8919.07] - - [4, 64, 1, 256] - - [694, 13.1032] + - [768, 13.1032] - - [704, 1024, 1, 3328] - - [771, 6648.12] + - [845, 6648.12] - - [2368, 1856, 1, 1280] - - [757, 8016.51] + - [831, 8016.51] - - [448, 5056, 1, 3328] - - [741, 8231.73] + - [815, 8231.73] - - [128, 448, 1, 128] - - [650, 441.208] + - [724, 441.208] - - [128, 6784, 1, 256] - - [751, 5850.05] + - [825, 5850.05] - - [512, 4, 1, 500000] - - [631, 1027.14] + - [705, 1027.14] - - [3584, 4288, 1, 128] - - [727, 4260.9] + - [801, 4260.9] - - [64, 448, 1, 128] - - [650, 253.554] + - [724, 253.554] - - [1024, 6000, 1, 2816] - - [755, 8886.14] + - [829, 8886.14] - - [5888, 4288, 1, 3328] - - [755, 8968.16] + - [829, 8968.16] - - [2368, 704, 1, 256] - - [771, 4663.24] + - [845, 4663.24] - - [256, 1856, 1, 3328] - - [743, 6480.63] + - [817, 6480.63] - - [1856, 128, 1, 256] - - [741, 3726.66] + - [815, 3726.66] - - [6784, 128, 1, 128] - - [644, 2824.01] + - [718, 2824.01] - - [3584, 1408, 1, 128] - - [727, 3666.78] + - [801, 3666.78] - - [1856, 5056, 1, 1280] - - [738, 8651.36] + - [812, 8651.36] - - [2944, 1024, 1, 1280] - - [749, 8765.21] + - [823, 8765.21] - - [5056, 4, 1, 256] - - [663, 428.688] + - [737, 428.688] - - [3584, 5888, 1, 3328] - - [749, 9347.75] + - [823, 9347.75] - - [2368, 4288, 1, 256] - - [759, 8013.1] + - [833, 8013.1] - - [1024, 2368, 1, 3328] - - [746, 8119.29] + - [820, 8119.29] - - [128, 3584, 1, 128] - - [646, 2584.62] + - [720, 2584.62] - - [704, 1408, 1, 256] - - [751, 6792.27] + - [825, 6792.27] - - [4096, 128, 1, 4096] - - [773, 6624.84] + - [847, 6624.84] - - [1024, 2944, 1, 128] - - [725, 3771.37] + - [799, 3771.37] - - [1024, 3584, 1, 1280] - - [746, 8952.71] + - [820, 8952.71] - - [4288, 5888, 1, 3328] - - [759, 9048.05] + - [833, 9048.05] - - [4288, 4, 1, 3328] - - [664, 615.206] + - [738, 615.206] - - [4608, 16, 1, 1536] - - [668, 2894.94] + - [742, 2894.94] - - [5888, 64, 1, 128] - - [655, 1827.16] + - [729, 1827.16] - - [4, 5888, 1, 128] - - [776, 179.544] + - [850, 179.544] - - [1024, 2944, 1, 3328] - - [747, 8298.77] + - [821, 8298.77] - - [2048, 64, 1, 2048] - - [676, 4963.77] + - [750, 4963.77] - - [6144, 2, 1, 2560] - - [665, 477.88] + - [739, 477.88] - - [256, 6784, 1, 1280] - - [739, 7491.94] + - [813, 7491.94] - - [1856, 3584, 1, 256] - - [751, 7580.6] + - [825, 7580.6] - - [128, 448, 1, 3328] - - [702, 4417.71] + - [776, 4417.71] - - [6784, 1856, 1, 128] - - [730, 4621.74] + - [804, 4621.74] - - [1024, 1500, 1, 2048] - - [751, 6284.5] + - [825, 6284.5] - - [5056, 128, 1, 256] - - [751, 5705.16] + - [825, 5705.16] - - [512, 24000, 1, 2816] - - [738, 8919.85] + - [812, 8919.85] - - [256, 5888, 1, 1280] - - [753, 7978.0] + - [827, 7978.0] - - [4, 128, 1, 1280] - - [694, 94.2609] + - [768, 94.2609] - - [4288, 6784, 1, 3328] - - [759, 9012.58] + - [833, 9012.58] - - [6784, 128, 1, 1280] - - [743, 6807.35] + - [817, 6807.35] - - [64, 1408, 1, 256] - - [672, 2045.19] + - [746, 2045.19] - - [2368, 1408, 1, 128] - - [723, 4340.73] + - [797, 4340.73] - - [1856, 448, 1, 256] - - [772, 3639.99] + - [846, 3639.99] - - [1408, 1024, 1, 128] - - [731, 3417.68] + - [805, 3417.68] - - [128, 64, 1, 128] - - [652, 68.7241] + - [726, 68.7241] - - [6784, 3584, 1, 3328] - - [749, 9425.63] + - [823, 9425.63] - - [1760, 7000, 1, 1760] - - [746, 8780.41] + - [820, 8780.41] - - [1024, 704, 1, 3328] - - [763, 5644.6] + - [837, 5644.6] - - [64, 64, 1, 128] - - [642, 38.2023] + - [716, 38.2023] - - [2368, 5056, 1, 1280] - - [760, 8462.41] + - [834, 8462.41] - - [64, 4, 1, 1280] - - [694, 46.6455] + - [768, 46.6455] - - [1408, 2368, 1, 1280] - - [746, 8235.08] + - [820, 8235.08] - - [128, 1408, 1, 1280] - - [708, 4491.66] + - [782, 4491.66] - - [1024, 1, 1, 512] - - [712, 82.02] + - [786, 82.02] - - [4, 1408, 1, 128] - - [776, 56.42] + - [850, 56.42] - - [704, 4288, 1, 128] - - [730, 3942.96] + - [804, 3942.96] - - [128, 1856, 1, 3328] - - [696, 6111.93] + - [770, 6111.93] - - [2944, 2944, 1, 256] - - [755, 8640.22] + - [829, 8640.22] - - [2944, 4, 1, 1280] - - [689, 554.265] + - [763, 554.265] - - [5888, 4, 1, 256] - - [671, 435.744] + - [745, 435.744] - - [6784, 256, 1, 256] - - [751, 7025.96] + - [825, 7025.96] - - [256, 5056, 1, 3328] - - [751, 8249.57] + - [825, 8249.57] - - [128, 4288, 1, 1280] - - [741, 5561.74] + - [815, 5561.74] - - [5056, 1856, 1, 128] - - [735, 3975.28] + - [809, 3975.28] - - [1024, 3000, 1, 1536] - - [756, 8544.54] + - [830, 8544.54] - - [5056, 1024, 1, 3328] - - [749, 9361.47] + - [823, 9361.47] - - [128, 128, 1, 256] - - [701, 699.151] + - [775, 699.151] - - [1760, 64, 1, 1760] - - [669, 4956.26] + - [743, 4956.26] - - [4288, 3584, 1, 3328] - - [769, 7506.18] + - [843, 7506.18] - - [448, 704, 1, 3328] - - [741, 4697.66] + - [815, 4697.66] - - [448, 448, 1, 128] - - [658, 1249.62] + - [732, 1249.62] - - [1024, 2368, 1, 1280] - - [751, 7756.44] + - [825, 7756.44] - - [1856, 704, 1, 3328] - - [751, 8340.66] + - [825, 8340.66] - - [512, 1500, 1, 2560] - - [753, 6041.39] + - [827, 6041.39] - - [5888, 6784, 1, 3328] - - [749, 9199.38] + - [823, 9199.38] - - [704, 4288, 1, 1280] - - [743, 8342.06] + - [817, 8342.06] - - [128, 50176, 1, 512] - - [789, 7589.48] + - [863, 7589.48] - - [704, 256, 1, 256] - - [741, 2912.81] + - [815, 2912.81] - - [1024, 48000, 1, 2048] - - [746, 8947.42] + - [820, 8947.42] - - [4288, 1024, 1, 128] - - [722, 4291.75] + - [796, 4291.75] - - [3136, 64, 128, 64] - - [804, 8175.16] - - - [784, 512, 64, 128] - - [802, 8378.44] - - - [3136, 256, 64, 64] - - [805, 8506.75] - - - [12544, 1024, 1, 256] - - [798, 8928.03] + - [878, 8175.16] - - [784, 128, 128, 512] - - [803, 8190.63] + - [877, 8190.63] - - [784, 512, 256, 128] - - [801, 8637.24] - - - [3136, 64, 64, 256] - - [800, 8783.03] - - - [3136, 512, 1, 2048] - - [797, 7298.42] - - - [12544, 256, 1, 1024] - - [809, 7667.35] - - - [3136, 2048, 1, 512] - - [808, 8447.32] + - [875, 8637.24] - - [3136, 256, 256, 64] - - [801, 8663.18] + - [875, 8663.18] - - [3136, 64, 128, 256] - - [799, 8943.56] - - - [784, 128, 64, 512] - - [807, 8006.37] + - [873, 8943.56] - - [3136, 64, 256, 64] - - [804, 8267.22] + - [878, 8267.22] - - [784, 512, 128, 128] - - [801, 8564.35] - - - [3136, 64, 64, 64] - - [804, 8009.45] + - [875, 8564.35] - - [784, 128, 256, 512] - - [805, 8377.16] + - [879, 8377.16] - - [3136, 64, 256, 256] - - [806, 9033.98] + - [880, 9033.98] - - [3136, 256, 128, 64] - - [801, 8624.56] + - [875, 8624.56] - - [1024, 256, 1, 1024] - - [827, 6331.13] + - [901, 6331.13] - - [1024, 512, 1, 2048] - - [826, 8100.14] + - [900, 8100.14] - - [512, 200, 1, 512] - - [835, 2861.93] + - [909, 2861.93] - - [4096, 256, 1, 2048] - - [818, 8812.82] + - [892, 8812.82] - - [4096, 512, 1, 1024] - - [828, 9068.87] + - [902, 9068.87] - - [1024, 200, 1, 1024] - - [827, 5110.12] + - [901, 5110.12] - - [1024, 512, 1, 1024] - - [820, 7785.35] + - [894, 7785.35] - - [2048, 256, 1, 4096] - - [830, 8438.81] + - [904, 8438.81] - - [2048, 768, 1, 512] - - [812, 8618.53] + - [886, 8618.53] - - [512, 256, 1, 1024] - - [832, 4835.03] + - [906, 4835.03] - - [512, 768, 1, 2048] - - [829, 6909.04] + - [903, 6909.04] - - [2048, 256, 1, 1024] - - [825, 7941.98] + - [899, 7941.98] - - [1024, 256, 1, 2048] - - [822, 6997.9] + - [896, 6997.9] - - [2048, 200, 1, 512] - - [825, 5649.76] + - [899, 5649.76] - - [4096, 200, 1, 1024] - - [823, 6678.93] + - [897, 6678.93] - - [2048, 200, 1, 4096] - - [831, 6706.69] + - [905, 6706.69] - - [2048, 512, 1, 1024] - - [828, 8549.0] + - [902, 8549.0] - - [1024, 1024, 1, 512] - - [823, 8046.73] + - [897, 8046.73] - - [1024, 200, 1, 4096] - - [822, 5884.36] + - [896, 5884.36] - - [2048, 512, 1, 4096] - - [833, 8995.94] + - [907, 8995.94] - - [4096, 512, 1, 2048] - - [828, 9298.18] + - [902, 9298.18] - - [4096, 1024, 1, 2048] - - [810, 9790.77] + - [884, 9790.77] - - [2048, 1024, 1, 2048] - - [811, 9278.9] + - [885, 9278.9] - - [1024, 200, 1, 512] - - [827, 4535.46] + - [901, 4535.46] - - [1024, 1024, 1, 4096] - - [818, 8967.39] + - [892, 8967.39] - - [2048, 1024, 1, 4096] - - [813, 9500.56] + - [887, 9500.56] - - [4096, 200, 1, 2048] - - [819, 7082.68] + - [893, 7082.68] - - [2048, 200, 1, 1024] - - [825, 6212.04] + - [899, 6212.04] - - [1024, 768, 1, 512] - - [826, 7401.81] + - [900, 7401.81] - - [2048, 512, 1, 512] - - [823, 8124.66] + - [897, 8124.66] - - [2048, 200, 1, 2048] - - [825, 6561.9] + - [899, 6561.9] - - [2048, 256, 1, 2048] - - [826, 8224.23] + - [900, 8224.23] - - [512, 768, 1, 512] - - [824, 6469.46] + - [898, 6469.46] - - [512, 200, 1, 1024] - - [827, 3755.74] + - [901, 3755.74] - - [4096, 1024, 1, 1024] - - [810, 9605.95] + - [884, 9605.95] - - [4096, 256, 1, 4096] - - [833, 8961.39] + - [907, 8961.39] - - [1024, 512, 1, 512] - - [826, 7109.09] + - [900, 7109.09] - - [512, 256, 1, 512] - - [834, 4033.08] + - [908, 4033.08] - - [1024, 256, 1, 4096] - - [822, 7326.4] - - - [4096, 512, 1, 4096] - - [814, 9472.07] + - [896, 7326.4] - - [1024, 200, 1, 2048] - - [815, 5530.56] + - [889, 5530.56] - - [2048, 1024, 1, 512] - - [816, 8995.93] + - [890, 8995.93] - - [1024, 1024, 1, 2048] - - [823, 8830.21] + - [897, 8830.21] - - [4096, 256, 1, 1024] - - [823, 8581.8] + - [897, 8581.8] - - [512, 768, 1, 1024] - - [824, 6876.01] + - [898, 6876.01] - - [1024, 512, 1, 4096] - - [820, 8484.15] + - [894, 8484.15] - - [1024, 256, 1, 512] - - [817, 5668.08] + - [891, 5668.08] - - [4096, 200, 1, 4096] - - [830, 7018.69] + - [904, 7018.69] - - [2048, 256, 1, 512] - - [830, 7079.09] + - [904, 7079.09] - - [512, 200, 1, 2048] - - [835, 4283.5] + - [909, 4283.5] - - [1024, 1024, 1, 1024] - - [818, 8565.37] + - [892, 8565.37] - - [2048, 512, 1, 2048] - - [818, 8850.59] + - [892, 8850.59] - - [4096, 1024, 1, 4096] - - [811, 9843.28] + - [885, 9843.28] - - [2048, 1024, 1, 1024] - - [816, 9234.21] + - [890, 9234.21] - - [4096, 384, 1, 2048] - - [858, 8892.62] + - [932, 8892.62] - - [4096, 192, 1, 2048] - - [852, 8024.28] + - [926, 8024.28] - - [289, 160, 64, 768] - - [854, 6783.73] + - [928, 6783.73] - - [1225, 192, 64, 384] - - [841, 9373.93] + - [915, 9373.93] - - [5329, 64, 64, 160] - - [845, 9186.79] + - [919, 9186.79] - - [1225, 64, 64, 288] - - [836, 8492.51] + - [910, 8492.51] - - [1225, 64, 64, 384] - - [840, 8735.86] + - [914, 8735.86] - - [289, 128, 64, 1024] - - [855, 7000.3] + - [929, 7000.3] - - [4096, 320, 1, 1280] - - [860, 8302.36] + - [934, 8302.36] - - [4096, 384, 1, 1536] - - [842, 9052.55] + - [916, 9052.55] - - [4096, 192, 1, 1280] - - [857, 7561.95] + - [931, 7561.95] - - [289, 192, 64, 768] - - [853, 7882.6] + - [927, 7882.6] - - [1225, 48, 64, 256] - - [844, 6620.35] + - [918, 6620.35] - - [289, 192, 64, 1024] - - [851, 7347.09] + - [925, 7347.09] - - [1225, 64, 64, 192] - - [837, 8098.45] + - [911, 8098.45] - - [1225, 96, 64, 384] - - [838, 8303.18] + - [912, 8303.18] - - [1225, 48, 64, 288] - - [846, 6746.87] + - [920, 6746.87] - - [4096, 320, 1, 2048] - - [847, 8384.52] + - [921, 8384.52] - - [4096, 256, 1, 1536] - - [859, 8734.44] + - [933, 8734.44] - - [1225, 48, 64, 192] - - [846, 6516.46] + - [920, 6516.46] - - [4096, 384, 1, 1280] - - [856, 9023.34] + - [930, 9023.34] - - [1225, 64, 64, 256] - - [843, 8319.44] + - [917, 8319.44] - - [4096, 448, 1, 1280] - - [847, 8343.42] + - [921, 8343.42] - - [289, 128, 64, 768] - - [849, 7668.08] + - [923, 7668.08] - - [289, 256, 64, 1024] - - [850, 7535.56] + - [924, 7535.56] - - [4096, 448, 1, 2048] - - [847, 8572.41] + - [921, 8572.41] - - [5329, 80, 64, 64] - - [846, 6492.54] + - [920, 6492.54] - - [1225, 32, 64, 192] - - [839, 6278.64] + - [913, 6278.64] - - [289, 384, 64, 1024] - - [848, 7767.67] + - [922, 7767.67] - - [1024, 3594, 1, 4096] - - [867, 8661.52] + - [941, 8661.52] - - [4096, 3103, 1, 1024] - - [877, 9652.23] + - [951, 9652.23] - - [4096, 3136, 1, 1024] - - [861, 9723.15] + - [935, 9723.15] - - [1024, 3141, 1, 4096] - - [879, 8612.12] + - [953, 8612.12] - - [64, 147, 432, 148] - - [894, 6372.03] + - [968, 6372.03] - - [4096, 3559, 1, 1024] - - [866, 9906.35] + - [940, 9906.35] - - [4096, 3368, 1, 1024] - - [861, 9721.01] + - [935, 9721.01] - - [1024, 3335, 1, 4096] - - [885, 8990.29] + - [959, 8990.29] - - [1024, 3510, 1, 4096] - - [885, 9440.68] + - [959, 9440.68] - - [4096, 3209, 1, 1024] - - [866, 9632.76] + - [940, 9632.76] - - [4096, 3322, 1, 1024] - - [865, 9939.52] + - [939, 9939.52] - - [1024, 3400, 1, 4096] - - [884, 9156.09] + - [958, 9156.09] - - [1024, 3995, 1, 4096] - - [867, 9610.25] + - [941, 9610.25] - - [1024, 3503, 1, 4096] - - [885, 9446.57] + - [959, 9446.57] - - [4096, 3594, 1, 1024] - - [876, 9691.96] + - [950, 9691.96] - - [4096, 3473, 1, 1024] - - [865, 9698.9] + - [939, 9698.9] - - [4096, 3522, 1, 1024] - - [866, 9816.92] + - [940, 9816.92] - - [1024, 3103, 1, 4096] - - [863, 8491.05] + - [937, 8491.05] - - [1024, 3214, 1, 4096] - - [884, 8667.67] + - [958, 8667.67] - - [4096, 3449, 1, 1024] - - [876, 9795.71] + - [950, 9795.71] - - [1024, 3136, 1, 4096] - - [885, 8500.61] + - [959, 8500.61] - - [1024, 3955, 1, 33708] - - [865, 9634.94] + - [939, 9634.94] - - [1024, 3780, 1, 4096] - - [868, 9088.88] + - [942, 9088.88] - - [1024, 3906, 1, 33708] - - [866, 9515.46] + - [940, 9515.46] - - [1024, 3386, 1, 4096] - - [885, 9116.05] + - [959, 9116.05] - - [4096, 3396, 1, 1024] - - [876, 9665.6] + - [950, 9665.6] - - [1024, 3183, 1, 4096] - - [863, 8662.94] + - [937, 8662.94] - - [1024, 3098, 1, 4096] - - [879, 8490.22] + - [953, 8490.22] - - [1024, 3548, 1, 4096] - - [885, 9555.63] + - [959, 9555.63] - - [1024, 3224, 1, 4096] - - [878, 8760.88] + - [952, 8760.88] - - [4096, 3469, 1, 1024] - - [865, 9687.21] + - [939, 9687.21] - - [1024, 3582, 1, 4096] - - [882, 9691.0] + - [956, 9691.0] - - [1024, 2977, 1, 4096] - - [867, 9379.38] + - [941, 9379.38] - - [1024, 3939, 1, 1024] - - [864, 9172.11] + - [938, 9172.11] - - [64, 123, 528, 123] - - [912, 6346.17] + - [986, 6346.17] - - [64, 12, 5040, 12] - - [889, 1536.1] + - [963, 1536.1] - - [4096, 3176, 1, 1024] - - [877, 9712.2] + - [951, 9712.2] - - [1024, 3559, 1, 4096] - - [881, 9579.84] + - [955, 9579.84] - - [1024, 3478, 1, 4096] - - [885, 9373.85] + - [959, 9373.85] - - [4096, 3343, 1, 1024] - - [861, 9638.77] + - [935, 9638.77] - - [4096, 3440, 1, 1024] - - [861, 9853.96] + - [935, 9853.96] - - [1024, 3996, 1, 33708] - - [865, 9733.55] + - [939, 9733.55] - - [1024, 4012, 1, 4096] - - [866, 9636.99] + - [940, 9636.99] - - [1024, 3322, 1, 4096] - - [885, 8945.12] + - [959, 8945.12] - - [1024, 3990, 1, 33708] - - [865, 9720.31] + - [939, 9720.31] - - [1024, 3314, 1, 4096] - - [885, 8944.72] + - [959, 8944.72] - - [4096, 3513, 1, 1024] - - [865, 9794.95] + - [939, 9794.95] - - [1024, 3562, 1, 4096] - - [885, 9597.28] + - [959, 9597.28] - - [1024, 3443, 1, 4096] - - [885, 9279.52] + - [959, 9279.52] - - [1024, 3554, 1, 4096] - - [882, 9552.16] + - [956, 9552.16] - - [1024, 3063, 1, 4096] - - [867, 9622.58] + - [941, 9622.58] - - [64, 111, 576, 112] - - [912, 6274.65] + - [986, 6274.65] - - [4096, 3460, 1, 1024] - - [865, 9665.69] + - [939, 9665.69] - - [1024, 3209, 1, 4096] - - [864, 8708.39] + - [938, 8708.39] - - [1024, 3147, 1, 4096] - - [885, 8492.23] + - [959, 8492.23] - - [4096, 3387, 1, 1024] - - [862, 9761.34] + - [936, 9761.34] - - [4096, 3436, 1, 1024] - - [861, 9815.15] + - [935, 9815.15] - - [1024, 3341, 1, 4096] - - [884, 9005.07] + - [958, 9005.07] - - [1024, 3516, 1, 4096] - - [884, 9471.39] + - [958, 9471.39] - - [4096, 3277, 1, 1024] - - [865, 9807.12] + - [939, 9807.12] - - [1024, 3454, 1, 4096] - - [885, 9301.03] + - [959, 9301.03] - - [1024, 3969, 1, 4096] - - [865, 9539.82] + - [939, 9539.82] - - [1024, 3999, 1, 4096] - - [866, 9607.52] + - [940, 9607.52] - - [1024, 4032, 1, 4096] - - [867, 9693.47] + - [941, 9693.47] - - [4096, 3541, 1, 1024] - - [866, 9866.73] + - [940, 9866.73] - - [4096, 3334, 1, 1024] - - [877, 9614.41] + - [951, 9614.41] - - [1024, 3365, 1, 4096] - - [885, 9058.58] + - [959, 9058.58] - - [1024, 3527, 1, 4096] - - [885, 9510.31] + - [959, 9510.31] - - [1024, 3190, 1, 4096] - - [884, 8627.8] + - [958, 8627.8] - - [4096, 3906, 1, 1024] - - [862, 9817.78] + - [936, 9817.78] - - [1024, 3593, 1, 4096] - - [867, 8663.09] + - [941, 8663.09] - - [1024, 3336, 1, 4096] - - [885, 8991.13] + - [959, 8991.13] - - [4096, 3504, 1, 1024] - - [865, 9769.86] + - [939, 9769.86] - - [4096, 3977, 1, 1024] - - [866, 9742.62] + - [940, 9742.62] - - [1024, 3906, 1, 4096] - - [866, 9386.25] + - [940, 9386.25] - - [4096, 3415, 1, 1024] - - [876, 9802.7] + - [950, 9802.7] - - [1024, 3295, 1, 4096] - - [884, 8879.26] + - [958, 8879.26] - - [4096, 3321, 1, 1024] - - [866, 9931.43] + - [940, 9931.43] - - [1024, 3072, 1, 4096] - - [867, 9671.71] + - [941, 9671.71] - - [1024, 3408, 1, 4096] - - [884, 9182.83] + - [958, 9182.83] - - [1024, 3522, 1, 4096] - - [885, 9484.63] + - [959, 9484.63] - - [4096, 3751, 1, 1024] - - [866, 9778.86] + - [940, 9778.86] - - [4096, 3378, 1, 1024] - - [876, 9692.77] + - [950, 9692.77] - - [64, 77, 816, 77] - - [918, 4850.29] + - [992, 4850.29] - - [1024, 3925, 1, 33708] - - [865, 9560.88] + - [939, 9560.88] - - [1024, 3990, 1, 1024] - - [867, 9272.75] + - [941, 9272.75] - - [1024, 3290, 1, 4096] - - [878, 8905.61] + - [952, 8905.61] - - [4096, 3500, 1, 1024] - - [866, 9761.82] + - [940, 9761.82] - - [4096, 3565, 1, 1024] - - [865, 9919.37] + - [939, 9919.37] - - [1024, 3484, 1, 4096] - - [884, 9376.52] + - [958, 9376.52] - - [4096, 3395, 1, 1024] - - [877, 9788.16] + - [951, 9788.16] - - [64, 92, 688, 92] - - [904, 5606.1] + - [978, 5606.1] - - [1024, 3681, 1, 1024] - - [869, 8690.23] + - [943, 8690.23] - - [64, 159, 400, 159] - - [896, 6518.97] + - [970, 6518.97] - - [1024, 3584, 1, 1024] - - [884, 9365.37] + - [958, 9365.37] - - [4096, 3093, 1, 1024] - - [876, 9623.41] + - [950, 9623.41] - - [1024, 4050, 1, 1024] - - [868, 9354.14] + - [942, 9354.14] - - [1024, 3301, 1, 4096] - - [885, 8889.04] + - [959, 8889.04] - - [1024, 3581, 1, 4096] - - [884, 9673.82] + - [958, 9673.82] - - [4096, 3374, 1, 1024] - - [877, 9707.33] + - [951, 9707.33] - - [1024, 3449, 1, 4096] - - [885, 9270.9] + - [959, 9270.9] - - [4096, 3215, 1, 1024] - - [866, 9645.25] + - [940, 9645.25] - - [4096, 3312, 1, 1024] - - [866, 9888.72] + - [940, 9888.72] - - [4096, 3479, 1, 1024] - - [866, 9698.61] + - [940, 9698.61] - - [4096, 3544, 1, 1024] - - [866, 9875.09] + - [940, 9875.09] - - [1024, 3263, 1, 4096] - - [885, 8787.61] + - [959, 8787.61] - - [4096, 3455, 1, 1024] - - [876, 9845.29] + - [950, 9845.29] - - [1024, 3379, 1, 4096] - - [882, 9100.01] + - [956, 9100.01] - - [1024, 3490, 1, 4096] - - [885, 9397.49] + - [959, 9397.49] - - [1024, 3368, 1, 4096] - - [885, 9079.25] + - [959, 9079.25] - - [4096, 3186, 1, 1024] - - [861, 9750.17] + - [935, 9750.17] - - [1024, 3428, 1, 4096] - - [885, 9232.92] + - [959, 9232.92] - - [64, 85, 752, 84] - - [900, 5342.67] + - [974, 5342.67] - - [4096, 3561, 1, 1024] - - [866, 9914.02] + - [940, 9914.02] - - [4096, 3418, 1, 1024] - - [876, 9765.86] + - [950, 9765.86] - - [1024, 3064, 1, 4096] - - [867, 9621.68] + - [941, 9621.68] - - [4096, 3259, 1, 1024] - - [866, 9765.52] + - [940, 9765.52] - - [4096, 3308, 1, 1024] - - [865, 9900.46] + - [939, 9900.46] - - [1024, 3533, 1, 4096] - - [885, 9520.12] + - [959, 9520.12] - - [1024, 3344, 1, 4096] - - [885, 9014.55] + - [959, 9014.55] - - [1024, 4030, 1, 1024] - - [867, 9354.1] + - [941, 9354.1] - - [4096, 3459, 1, 1024] - - [866, 9656.2] + - [940, 9656.2] - - [1024, 3572, 1, 4096] - - [882, 9640.07] + - [956, 9640.07] - - [1024, 3925, 1, 1024] - - [878, 9173.74] + - [952, 9173.74] - - [4096, 3435, 1, 1024] - - [861, 9778.2] + - [935, 9778.2] - - [1024, 3956, 1, 4096] - - [868, 9498.56] + - [942, 9498.56] - - [1024, 3463, 1, 4096] - - [885, 9332.46] + - [959, 9332.46] - - [4096, 3182, 1, 1024] - - [876, 9826.84] + - [950, 9826.84] - - [4096, 3976, 1, 1024] - - [876, 9741.99] + - [950, 9741.99] - - [1024, 3417, 1, 4096] - - [885, 9208.97] + - [959, 9208.97] - - [1024, 3528, 1, 4096] - - [885, 9509.09] + - [959, 9509.09] - - [4096, 3446, 1, 1024] - - [876, 9816.97] + - [950, 9816.97] - - [64, 122, 528, 123] - - [912, 6325.98] + - [986, 6325.98] - - [1024, 3543, 1, 4096] - - [885, 9538.73] + - [959, 9538.73] - - [4096, 3287, 1, 1024] - - [865, 9846.04] + - [939, 9846.04] - - [1024, 3499, 1, 4096] - - [885, 9428.51] + - [959, 9428.51] - - [1024, 3231, 1, 4096] - - [878, 8769.91] + - [952, 8769.91] - - [64, 17, 3632, 17] - - [900, 1934.94] + - [974, 1934.94] - - [4096, 3519, 1, 1024] - - [865, 9804.38] + - [939, 9804.38] - - [4096, 3552, 1, 1024] - - [865, 9892.65] + - [939, 9892.65] - - [1024, 3458, 1, 4096] - - [885, 9312.28] + - [959, 9312.28] - - [64, 93, 688, 92] - - [904, 5660.22] + - [978, 5660.22] - - [1024, 3374, 1, 4096] - - [879, 9110.41] + - [953, 9110.41] - - [1024, 3396, 1, 4096] - - [885, 9145.79] + - [959, 9145.79] - - [1024, 2967, 1, 4096] - - [867, 9364.76] + - [941, 9364.76] - - [64, 19, 3264, 19] - - [904, 2142.47] + - [978, 2142.47] - - [4096, 3482, 1, 1024] - - [865, 9714.2] + - [939, 9714.2] - - [64, 32, 1984, 32] - - [915, 3619.91] + - [989, 3619.91] - - [64, 102, 624, 99] - - [906, 5515.33] + - [980, 5515.33] - - [1024, 3226, 1, 4096] - - [864, 8790.47] + - [938, 8790.47] - - [4096, 3377, 1, 1024] - - [862, 9684.08] + - [936, 9684.08] - - [4096, 3426, 1, 1024] - - [877, 9869.94] + - [951, 9869.94] - - [4096, 2935, 1, 1024] - - [877, 9762.11] + - [951, 9762.11] - - [64, 133, 480, 133] - - [916, 5891.32] + - [990, 5891.32] - - [1024, 3439, 1, 4096] - - [885, 9253.99] + - [959, 9253.99] - - [4096, 3267, 1, 1024] - - [865, 9783.9] + - [939, 9783.9] - - [4096, 3499, 1, 1024] - - [866, 9761.11] + - [940, 9761.11] - - [4096, 3356, 1, 1024] - - [877, 9679.44] + - [951, 9679.44] - - [64, 232, 272, 232] - - [920, 7181.03] + - [994, 7181.03] - - [64, 162, 400, 159] - - [880, 6444.63] + - [954, 6444.63] - - [4096, 3939, 1, 1024] - - [876, 9878.0] + - [950, 9878.0] - - [1024, 3526, 1, 4096] - - [885, 9508.1] + - [959, 9508.1] - - [1024, 3859, 1, 33708] - - [866, 9402.13] + - [940, 9402.13] - - [1024, 3385, 1, 4096] - - [884, 9107.28] + - [958, 9107.28] - - [1024, 3496, 1, 4096] - - [885, 9418.0] + - [959, 9418.0] - - [4096, 3141, 1, 1024] - - [877, 9682.54] + - [951, 9682.54] - - [4096, 3510, 1, 1024] - - [865, 9786.59] + - [939, 9786.59] - - [1024, 3434, 1, 4096] - - [885, 9246.7] + - [959, 9246.7] - - [4096, 3969, 1, 1024] - - [865, 9714.85] + - [939, 9714.85] - - [1024, 3121, 1, 4096] - - [863, 8464.32] + - [937, 8464.32] - - [1024, 3232, 1, 4096] - - [885, 8711.73] + - [959, 8711.73] - - [1024, 4030, 1, 33708] - - [866, 9816.31] + - [940, 9816.31] - - [1024, 3780, 1, 33708] - - [874, 9315.54] + - [948, 9315.54] - - [1024, 3969, 1, 1024] - - [863, 9248.54] + - [937, 9248.54] - - [4096, 3527, 1, 1024] - - [865, 9832.94] + - [939, 9832.94] - - [4096, 3336, 1, 1024] - - [862, 9623.35] + - [936, 9623.35] - - [4096, 3290, 1, 1024] - - [865, 9852.21] + - [939, 9852.21] - - [64, 9, 6544, 9] - - [905, 1068.24] + - [979, 1068.24] - - [1024, 3469, 1, 4096] - - [885, 9350.55] + - [959, 9350.55] - - [4096, 3490, 1, 1024] - - [865, 9737.56] + - [939, 9737.56] - - [4096, 3064, 1, 1024] - - [865, 9890.02] + - [939, 9890.02] - - [4096, 3582, 1, 1024] - - [866, 9961.38] + - [940, 9961.38] - - [1024, 3956, 1, 1024] - - [863, 9294.25] + - [937, 9294.25] - - [4096, 3417, 1, 1024] - - [861, 9811.66] + - [935, 9811.66] - - [1024, 2736, 1, 4096] - - [867, 8636.7] + - [941, 8636.7] - - [64, 78, 816, 78] - - [904, 4946.1] + - [978, 4946.1] - - [1024, 3205, 1, 4096] - - [879, 8657.21] + - [953, 8657.21] - - [1024, 3143, 1, 4096] - - [879, 8567.87] + - [953, 8567.87] - - [1024, 4020, 1, 4096] - - [867, 9664.62] + - [941, 9664.62] - - [1024, 3318, 1, 4096] - - [864, 8967.05] + - [938, 8967.05] - - [4096, 3364, 1, 1024] - - [877, 9697.18] + - [951, 9697.18] - - [1024, 3353, 1, 4096] - - [885, 9034.17] + - [959, 9034.17] - - [1024, 3464, 1, 4096] - - [885, 9326.05] + - [959, 9326.05] - - [4096, 3205, 1, 1024] - - [865, 9619.1] + - [939, 9619.1] - - [4096, 3318, 1, 1024] - - [866, 9932.66] + - [940, 9932.66] - - [1024, 3402, 1, 4096] - - [884, 9153.49] + - [958, 9153.49] - - [4096, 3181, 1, 1024] - - [876, 9789.15] + - [950, 9789.15] - - [4096, 3550, 1, 1024] - - [866, 9888.13] + - [940, 9888.13] - - [4096, 3445, 1, 1024] - - [876, 9752.65] + - [950, 9752.65] - - [1024, 3138, 1, 4096] - - [862, 8484.1] + - [936, 8484.1] - - [64, 99, 624, 99] - - [912, 5323.99] + - [986, 5323.99] - - [4096, 3079, 1, 1024] - - [862, 9562.26] + - [936, 9562.26] - - [4096, 3144, 1, 1024] - - [876, 9686.66] + - [950, 9686.66] - - [4096, 3860, 1, 1024] - - [877, 9733.42] + - [951, 9733.42] - - [1024, 3515, 1, 4096] - - [885, 9478.44] + - [959, 9478.44] - - [4096, 3408, 1, 1024] - - [862, 9764.96] + - [936, 9764.96] - - [64, 101, 624, 102] - - [912, 5482.79] + - [986, 5482.79] - - [1024, 3181, 1, 4096] - - [864, 8593.26] + - [938, 8593.26] - - [4096, 3298, 1, 1024] - - [866, 9867.72] + - [940, 9867.72] - - [4096, 3585, 1, 1024] - - [876, 9633.01] + - [950, 9633.01] - - [1024, 3550, 1, 4096] - - [885, 9564.46] + - [959, 9564.46] - - [1024, 4020, 1, 1024] - - [868, 9339.15] + - [942, 9339.15] - - [4096, 3481, 1, 1024] - - [866, 9714.0] + - [940, 9714.0] - - [4096, 3530, 1, 1024] - - [866, 9833.99] + - [940, 9833.99] - - [4096, 3425, 1, 1024] - - [862, 9675.66] + - [936, 9675.66] - - [4096, 4026, 1, 1024] - - [866, 9849.77] + - [940, 9849.77] - - [1024, 3860, 1, 1024] - - [879, 9073.59] + - [953, 9073.59] - - [4096, 3975, 1, 1024] - - [866, 9737.72] + - [940, 9737.72] - - [1024, 3286, 1, 4096] - - [863, 8884.24] + - [937, 8884.24] - - [1024, 3176, 1, 4096] - - [863, 8597.48] + - [937, 8597.48] - - [1024, 3894, 1, 4096] - - [867, 9359.13] + - [941, 9359.13] - - [4096, 3355, 1, 1024] - - [876, 9693.09] + - [950, 9693.09] - - [4096, 3404, 1, 1024] - - [876, 9786.12] + - [950, 9786.12] - - [1024, 3501, 1, 4096] - - [884, 9426.14] + - [958, 9426.14] - - [4096, 3245, 1, 1024] - - [866, 9723.57] + - [940, 9723.57] - - [1024, 3431, 1, 4096] - - [882, 9244.32] + - [956, 9244.32] - - [1024, 4000, 1, 1024] - - [878, 9344.03] + - [952, 9344.03] - - [4096, 3509, 1, 1024] - - [865, 9781.72] + - [939, 9781.72] - - [4096, 3558, 1, 1024] - - [866, 9905.15] + - [940, 9905.15] - - [1024, 3535, 1, 4096] - - [884, 9519.15] + - [958, 9519.15] - - [1024, 3414, 1, 4096] - - [882, 9198.05] + - [956, 9198.05] - - [1024, 3445, 1, 4096] - - [885, 9279.66] + - [959, 9279.66] - - [1024, 3436, 1, 4096] - - [885, 9259.7] + - [959, 9259.7] - - [4096, 3472, 1, 1024] - - [866, 9685.27] + - [940, 9685.27] - - [1024, 3211, 1, 4096] - - [864, 8708.41] + - [938, 8708.41] - - [64, 7, 8192, 7] - - [901, 802.916] + - [975, 802.916] - - [4096, 3383, 1, 1024] - - [876, 9734.82] + - [950, 9734.82] - - [4096, 3448, 1, 1024] - - [877, 9828.54] + - [951, 9828.54] - - [1024, 3343, 1, 4096] - - [878, 9010.46] + - [952, 9010.46] - - [1024, 3518, 1, 4096] - - [885, 9468.02] + - [959, 9468.02] - - [4096, 3289, 1, 1024] - - [866, 9844.16] + - [940, 9844.16] - - [1024, 3440, 1, 4096] - - [881, 9269.52] + - [955, 9269.52] - - [1024, 4032, 1, 33708] - - [865, 9822.41] + - [939, 9822.41] - - [4096, 3489, 1, 1024] - - [865, 9742.03] + - [939, 9742.03] - - [4096, 3346, 1, 1024] - - [862, 9616.74] + - [936, 9616.74] - - [1024, 3534, 1, 4096] - - [884, 9524.29] + - [958, 9524.29] - - [1024, 3079, 1, 4096] - - [879, 8397.77] + - [953, 8397.77] - - [1024, 3955, 1, 4096] - - [866, 9492.25] + - [940, 9492.25] - - [4096, 3236, 1, 1024] - - [866, 9706.03] + - [940, 9706.03] - - [1024, 3545, 1, 4096] - - [884, 9551.97] + - [958, 9551.97] - - [1024, 3144, 1, 4096] - - [878, 8556.8] + - [952, 8556.8] - - [4096, 3780, 1, 1024] - - [865, 9847.6] + - [939, 9847.6] - - [4096, 3163, 1, 1024] - - [876, 9717.79] + - [950, 9717.79] - - [4096, 3468, 1, 1024] - - [866, 9686.49] + - [940, 9686.49] - - [1024, 3539, 1, 4096] - - [885, 9526.99] + - [959, 9526.99] - - [1024, 3541, 1, 4096] - - [885, 9532.86] + - [959, 9532.86] - - [4096, 3363, 1, 1024] - - [861, 9699.1] + - [935, 9699.1] - - [1024, 3475, 1, 4096] - - [885, 9357.1] + - [959, 9357.1] - - [4096, 3110, 1, 1024] - - [877, 9659.68] + - [951, 9659.68] - - [1024, 3509, 1, 4096] - - [884, 9450.59] + - [958, 9450.59] - - [1024, 3413, 1, 4096] - - [885, 9185.91] + - [959, 9185.91] - - [1024, 3975, 1, 1024] - - [863, 9315.52] + - [937, 9315.52] - - [4096, 3549, 1, 1024] - - [866, 9884.82] + - [940, 9884.82] - - [4096, 3342, 1, 1024] - - [876, 9644.37] + - [950, 9644.37] - - [1024, 2985, 1, 4096] - - [866, 9392.17] + - [940, 9392.17] - - [1024, 3876, 1, 33708] - - [865, 9442.32] + - [939, 9442.32] - - [4096, 3280, 1, 1024] - - [865, 9820.02] + - [939, 9820.02] - - [4096, 3191, 1, 1024] - - [877, 9862.18] + - [951, 9862.18] - - [4096, 3512, 1, 1024] - - [866, 9793.21] + - [940, 9793.21] - - [1024, 3560, 1, 4096] - - [882, 9555.55] + - [956, 9555.55] - - [4096, 2499, 1, 1024] - - [866, 9669.45] + - [940, 9669.45] - - [1024, 3248, 1, 4096] - - [863, 8811.94] + - [937, 8811.94] - - [4096, 3423, 1, 1024] - - [877, 9729.77] + - [951, 9729.77] - - [64, 111, 576, 111] - - [912, 5982.73] + - [986, 5982.73] - - [4096, 3297, 1, 1024] - - [865, 9865.29] + - [939, 9865.29] - - [4096, 3154, 1, 1024] - - [877, 9613.52] + - [951, 9613.52] - - [1024, 3303, 1, 4096] - - [864, 8951.89] + - [938, 8951.89] - - [1024, 3222, 1, 4096] - - [884, 8682.99] + - [958, 8682.99] - - [1024, 3978, 1, 1024] - - [868, 9235.03] + - [942, 9235.03] - - [4096, 3529, 1, 1024] - - [866, 9831.72] + - [940, 9831.72] - - [4096, 3386, 1, 1024] - - [876, 9755.77] + - [950, 9755.77] - - [64, 134, 480, 134] - - [891, 5990.63] + - [965, 5990.63] - - [1024, 3451, 1, 4096] - - [882, 9277.71] + - [956, 9277.71] - - [4096, 3562, 1, 1024] - - [866, 9908.92] + - [940, 9908.92] - - [4096, 3276, 1, 1024] - - [865, 9818.14] + - [939, 9818.14] - - [64, 135, 480, 132] - - [920, 6071.87] + - [994, 6071.87] - - [1024, 3894, 1, 33708] - - [865, 9487.89] + - [939, 9487.89] - - [64, 134, 480, 132] - - [919, 6091.75] + - [993, 6091.75] - - [4096, 3540, 1, 1024] - - [866, 9862.89] + - [940, 9862.89] - - [1024, 3416, 1, 4096] - - [884, 9206.27] + - [958, 9206.27] - - [1024, 4005, 1, 33708] - - [865, 9757.29] + - [939, 9757.29] - - [1024, 3942, 1, 4096] - - [868, 9455.85] + - [942, 9455.85] - - [4096, 3403, 1, 1024] - - [876, 9739.46] + - [950, 9739.46] - - [4096, 3381, 1, 1024] - - [877, 9760.14] + - [951, 9760.14] - - [1024, 3492, 1, 4096] - - [881, 9391.79] + - [955, 9391.79] - - [4096, 3101, 1, 1024] - - [877, 9626.02] + - [951, 9626.02] - - [1024, 3430, 1, 4096] - - [885, 9232.14] + - [959, 9232.14] - - [1024, 3977, 1, 4096] - - [868, 9563.0] + - [942, 9563.0] - - [1024, 3640, 1, 4096] - - [867, 8761.5] + - [941, 8761.5] - - [4096, 3557, 1, 1024] - - [866, 9905.52] + - [940, 9905.52] - - [4096, 3414, 1, 1024] - - [862, 9755.49] + - [936, 9755.49] - - [1024, 3391, 1, 4096] - - [885, 9142.66] + - [959, 9142.66] - - [64, 134, 480, 135] - - [894, 5922.15] + - [968, 5922.15] - - [64, 16, 3840, 16] - - [910, 2080.61] + - [984, 2080.61] - - [1024, 3356, 1, 4096] - - [885, 9051.09] + - [959, 9051.09] - - [4096, 3320, 1, 1024] - - [866, 9929.57] + - [940, 9929.57] - - [4096, 2765, 1, 1024] - - [866, 9750.28] + - [940, 9750.28] - - [64, 162, 400, 162] - - [883, 6515.29] + - [957, 6515.29] - - [1024, 3411, 1, 4096] - - [885, 9185.72] + - [959, 9185.72] - - [1024, 3978, 1, 4096] - - [865, 9562.77] + - [939, 9562.77] - - [4096, 3487, 1, 1024] - - [866, 9733.85] + - [940, 9733.85] - - [4096, 3520, 1, 1024] - - [865, 9813.95] + - [939, 9813.95] - - [4096, 3942, 1, 1024] - - [876, 9804.39] + - [950, 9804.39] - - [4096, 3431, 1, 1024] - - [861, 9819.06] + - [935, 9819.06] - - [1024, 3271, 1, 4096] - - [878, 8913.08] + - [952, 8913.08] - - [4096, 4020, 1, 1024] - - [865, 9831.42] + - [939, 9831.42] - - [1024, 3481, 1, 4096] - - [881, 9376.15] + - [955, 9376.15] - - [1024, 3419, 1, 4096] - - [884, 9208.68] + - [958, 9208.68] - - [1024, 4059, 1, 4096] - - [868, 9733.83] + - [942, 9733.83] - - [4096, 3345, 1, 1024] - - [877, 9651.43] + - [951, 9651.43] - - [4096, 3394, 1, 1024] - - [877, 9780.43] + - [951, 9780.43] - - [1024, 3298, 1, 4096] - - [884, 8889.63] + - [958, 8889.63] - - [4096, 3235, 1, 1024] - - [866, 9705.81] + - [940, 9705.81] - - [1024, 3681, 1, 33708] - - [873, 9146.22] + - [947, 9146.22] - - [1024, 3840, 1, 4096] - - [866, 9253.95] + - [940, 9253.95] - - [1024, 3362, 1, 4096] - - [885, 9059.81] + - [959, 9059.81] - - [4096, 3467, 1, 1024] - - [865, 9677.51] + - [939, 9677.51] - - [1024, 3349, 1, 4096] - - [885, 9034.07] + - [959, 9034.07] - - [1024, 3460, 1, 4096] - - [885, 9322.94] + - [959, 9322.94] - - [4096, 3214, 1, 1024] - - [866, 9644.46] + - [940, 9644.46] - - [1024, 3398, 1, 4096] - - [885, 9157.29] + - [959, 9157.29] - - [4096, 3478, 1, 1024] - - [865, 9706.66] + - [939, 9706.66] - - [1024, 4050, 1, 33708] - - [865, 9865.14] + - [939, 9865.14] - - [1024, 3244, 1, 4096] - - [881, 8744.53] + - [955, 8744.53] - - [4096, 3341, 1, 1024] - - [877, 9646.79] + - [951, 9646.79] - - [4096, 3454, 1, 1024] - - [862, 9880.56] + - [936, 9880.56] - - [1024, 3166, 1, 4096] - - [879, 8618.46] + - [953, 8618.46] - - [1024, 3425, 1, 4096] - - [885, 9225.32] + - [959, 9225.32] - - [4096, 3295, 1, 1024] - - [866, 9863.81] + - [940, 9863.81] - - [4096, 3072, 1, 1024] - - [865, 9971.09] + - [939, 9971.09] - - [4096, 3822, 1, 1024] - - [866, 9952.07] + - [940, 9952.07] - - [1024, 3681, 1, 4096] - - [867, 8856.94] + - [941, 8856.94] - - [1024, 4050, 1, 4096] - - [867, 9717.58] + - [941, 9717.58] - - [4096, 3495, 1, 1024] - - [865, 9741.14] + - [939, 9741.14] - - [4096, 3560, 1, 1024] - - [866, 9909.14] + - [940, 9909.14] - - [1024, 3524, 1, 4096] - - [884, 9503.2] + - [958, 9503.2] - - [1024, 3942, 1, 33708] - - [865, 9602.67] + - [939, 9602.67] - - [1024, 3304, 1, 4096] - - [864, 8928.76] + - [938, 8928.76] - - [1024, 3387, 1, 4096] - - [885, 9127.65] + - [959, 9127.65] - - [1024, 3498, 1, 4096] - - [884, 9423.39] + - [958, 9423.39] - - [4096, 3458, 1, 1024] - - [865, 9642.63] + - [939, 9642.63] - - [4096, 2967, 1, 1024] - - [865, 9626.71] + - [939, 9626.71] - - [64, 8, 7280, 8] - - [887, 1032.61] + - [961, 1032.61] - - [4096, 3385, 1, 1024] - - [861, 9735.77] + - [935, 9735.77] - - [4096, 3434, 1, 1024] - - [876, 9808.9] + - [950, 9808.9] - - [1024, 3519, 1, 4096] - - [885, 9484.83] + - [959, 9484.83] - - [1024, 3511, 1, 4096] - - [885, 9456.47] + - [959, 9456.47] - - [1024, 3288, 1, 4096] - - [884, 8864.05] + - [958, 8864.05] - - [1024, 2918, 1, 4096] - - [867, 9170.35] + - [941, 9170.35] - - [4096, 3573, 1, 1024] - - [866, 9945.85] + - [940, 9945.85] - - [1024, 3822, 1, 33708] - - [875, 9331.0] + - [949, 9331.0] - - [64, 102, 624, 102] - - [912, 5531.17] + - [986, 5531.17] - - [4096, 3539, 1, 1024] - - [866, 9855.39] + - [940, 9855.39] - - [4096, 3332, 1, 1024] - - [877, 9648.97] + - [951, 9648.97] - - [4096, 3286, 1, 1024] - - [866, 9846.42] + - [940, 9846.42] - - [1024, 4026, 1, 4096] - - [867, 9675.94] + - [941, 9675.94] - - [1024, 3277, 1, 4096] - - [881, 8836.21] + - [955, 8836.21] - - [1024, 3471, 1, 4096] - - [885, 9346.33] + - [959, 9346.33] - - [4096, 3518, 1, 1024] - - [866, 9804.2] + - [940, 9804.2] - - [1024, 3393, 1, 4096] - - [885, 9148.99] + - [959, 9148.99] - - [4096, 3413, 1, 1024] - - [862, 9785.17] + - [936, 9785.17] - - [4096, 3303, 1, 1024] - - [866, 9884.37] + - [940, 9884.37] - - [1024, 3207, 1, 4096] - - [863, 8714.69] + - [937, 8714.69] - - [1024, 3894, 1, 1024] - - [879, 9181.51] + - [953, 9181.51] - - [1024, 3977, 1, 1024] - - [879, 9240.9] + - [953, 9240.9] - - [64, 135, 480, 133] - - [894, 5923.4] + - [968, 5923.4] - - [4096, 3535, 1, 1024] - - [866, 9839.55] + - [940, 9839.55] - - [4096, 3376, 1, 1024] - - [861, 9712.02] + - [935, 9712.02] - - [1024, 3355, 1, 4096] - - [885, 9043.27] + - [959, 9043.27] - - [64, 27, 2336, 27] - - [913, 2929.9] + - [987, 2929.9] - - [1024, 3466, 1, 4096] - - [885, 9339.1] + - [959, 9339.1] - - [4096, 3266, 1, 1024] - - [866, 9789.29] + - [940, 9789.29] - - [1024, 3404, 1, 4096] - - [885, 9176.76] + - [959, 9176.76] - - [1024, 3999, 1, 1024] - - [878, 9391.91] + - [952, 9391.91] - - [64, 148, 432, 143] - - [891, 6182.92] + - [965, 6182.92] - - [4096, 3498, 1, 1024] - - [865, 9764.56] + - [939, 9764.56] - - [1024, 4032, 1, 1024] - - [863, 9402.03] + - [937, 9402.03] - - [1024, 3410, 1, 4096] - - [884, 9183.5] + - [958, 9183.5] - - [4096, 3393, 1, 1024] - - [877, 9695.49] + - [951, 9695.49] - - [1024, 3140, 1, 4096] - - [878, 8504.86] + - [952, 8504.86] - - [1024, 3910, 1, 33708] - - [865, 9526.06] + - [939, 9526.06] - - [1024, 3334, 1, 4096] - - [884, 8987.59] + - [958, 8987.59] - - [4096, 3140, 1, 1024] - - [877, 9660.71] + - [951, 9660.71] - - [1024, 4005, 1, 4096] - - [868, 9629.88] + - [942, 9629.88] - - [1024, 3579, 1, 4096] - - [884, 9661.45] + - [958, 9661.45] - - [4096, 3372, 1, 1024] - - [877, 9697.32] + - [951, 9697.32] - - [1024, 3245, 1, 4096] - - [878, 8847.76] + - [952, 8847.76] - - [64, 38, 1680, 38] - - [888, 3340.44] + - [962, 3340.44] - - [4096, 3956, 1, 1024] - - [877, 9911.15] + - [951, 9911.15] - - [4096, 3213, 1, 1024] - - [865, 9643.11] + - [939, 9643.11] - - [1024, 3361, 1, 4096] - - [885, 9062.24] + - [959, 9062.24] - - [1024, 3536, 1, 4096] - - [884, 9530.65] + - [958, 9530.65] - - [1024, 3968, 1, 1024] - - [879, 9377.92] + - [953, 9377.92] - - [4096, 3477, 1, 1024] - - [866, 9700.77] + - [940, 9700.77] - - [4096, 3526, 1, 1024] - - [866, 9824.41] + - [940, 9824.41] - - [1024, 4005, 1, 1024] - - [863, 9362.39] + - [937, 9362.39] - - [1024, 3530, 1, 4096] - - [882, 9487.17] + - [956, 9487.17] - - [1024, 3944, 1, 4096] - - [867, 9464.55] + - [941, 9464.55] - - [4096, 3453, 1, 1024] - - [876, 9826.77] + - [950, 9826.77] - - [4096, 3184, 1, 1024] - - [877, 9833.59] + - [951, 9833.59] - - [4096, 3579, 1, 1024] - - [866, 9962.55] + - [940, 9962.55] - - [4096, 3351, 1, 1024] - - [877, 9653.34] + - [951, 9653.34] - - [4096, 3416, 1, 1024] - - [861, 9810.4] + - [935, 9810.4] - - [64, 100, 624, 100] - - [912, 5408.55] + - [986, 5408.55] - - [1024, 3822, 1, 4096] - - [867, 9196.2] + - [941, 9196.2] - - [1024, 3796, 1, 4096] - - [867, 9131.96] + - [941, 9131.96] - - [4096, 3257, 1, 1024] - - [865, 9767.34] + - [939, 9767.34] - - [4096, 3306, 1, 1024] - - [865, 9893.35] + - [939, 9893.35] - - [1024, 3505, 1, 4096] - - [885, 9450.02] + - [959, 9450.02] - - [1024, 3315, 1, 4096] - - [878, 8979.77] + - [952, 8979.77] - - [1024, 3486, 1, 4096] - - [884, 9393.48] + - [958, 9393.48] - - [4096, 3457, 1, 1024] - - [865, 9653.19] + - [939, 9653.19] - - [4096, 3870, 1, 1024] - - [862, 9717.51] + - [936, 9717.51] - - [1024, 3447, 1, 4096] - - [885, 9273.14] + - [959, 9273.14] - - [1024, 3558, 1, 4096] - - [882, 9567.33] + - [956, 9567.33] - - [4096, 3433, 1, 1024] - - [862, 9759.26] + - [936, 9759.26] - - [4096, 3180, 1, 1024] - - [877, 9738.63] + - [951, 9738.63] - - [1024, 3213, 1, 4096] - - [863, 8692.25] + - [937, 8692.25] - - [1024, 3900, 1, 4096] - - [867, 9388.61] + - [941, 9388.61] - - [4096, 3444, 1, 1024] - - [876, 9869.73] + - [950, 9869.73] - - [1024, 3504, 1, 4096] - - [885, 9429.38] + - [959, 9429.38] - - [4096, 4059, 1, 1024] - - [866, 9920.79] + - [940, 9920.79] - - [1024, 3442, 1, 4096] - - [885, 9273.01] + - [959, 9273.01] - - [4096, 3517, 1, 1024] - - [865, 9808.19] + - [939, 9808.19] - - [1024, 3566, 1, 4096] - - [884, 9622.89] + - [958, 9622.89] - - [4096, 3248, 1, 1024] - - [865, 9730.33] + - [939, 9730.33] - - [1024, 3547, 1, 4096] - - [884, 9564.73] + - [958, 9564.73] - - [64, 59, 1088, 59] - - [903, 4611.76] + - [977, 4611.76] - - [1024, 3340, 1, 4096] - - [884, 8992.21] + - [958, 8992.21] - - [4096, 3480, 1, 1024] - - [866, 9710.17] + - [940, 9710.17] - - [1024, 3968, 1, 4096] - - [866, 9543.11] + - [940, 9543.11] - - [4096, 3424, 1, 1024] - - [862, 9808.66] + - [936, 9808.66] - - [1024, 3906, 1, 1024] - - [864, 9150.54] + - [938, 9150.54] - - [4096, 3265, 1, 1024] - - [865, 9786.85] + - [939, 9786.85] - - [1024, 3384, 1, 4096] - - [885, 9119.56] + - [959, 9119.56] - - [1024, 3494, 1, 4096] - - [882, 9415.52] + - [956, 9415.52] - - [1024, 3236, 1, 4096] - - [879, 8767.14] + - [953, 8767.14] - - [4096, 3497, 1, 1024] - - [866, 9750.86] + - [940, 9750.86] - - [4096, 3354, 1, 1024] - - [877, 9665.17] + - [951, 9665.17] - - [4096, 3055, 1, 1024] - - [866, 9884.09] + - [940, 9884.09] - - [64, 11, 5456, 11] - - [889, 1368.34] + - [963, 1368.34] - - [4096, 3244, 1, 1024] - - [865, 9720.02] + - [939, 9720.02] - - [4096, 3139, 1, 1024] - - [876, 9737.06] + - [950, 9737.06] - - [4096, 3508, 1, 1024] - - [865, 9771.66] + - [939, 9771.66] - - [4096, 4050, 1, 1024] - - [865, 9898.79] + - [939, 9898.79] - - [1024, 3472, 1, 4096] - - [884, 9353.83] + - [958, 9353.83] - - [1024, 3861, 1, 1024] - - [863, 9061.32] + - [937, 9061.32] - - [1024, 3910, 1, 1024] - - [867, 9043.54] + - [941, 9043.54] - - [4096, 3371, 1, 1024] - - [877, 9738.24] + - [951, 9738.24] - - [64, 65, 992, 65] - - [916, 4354.59] + - [990, 4354.59] - - [1024, 3751, 1, 4096] - - [866, 9018.74] + - [940, 9018.74] - - [4096, 3325, 1, 1024] - - [865, 9958.73] + - [939, 9958.73] - - [1024, 3321, 1, 4096] - - [885, 8952.55] + - [959, 8952.55] - - [1024, 3944, 1, 1024] - - [864, 9117.35] + - [938, 9117.35] - - [4096, 3525, 1, 1024] - - [866, 9822.14] + - [940, 9822.14] - - [4096, 3382, 1, 1024] - - [877, 9720.21] + - [951, 9720.21] - - [64, 122, 528, 122] - - [912, 6389.33] + - [986, 6389.33] - - [1024, 3453, 1, 4096] - - [882, 9305.03] + - [956, 9305.03] - - [4096, 3564, 1, 1024] - - [865, 9911.32] + - [939, 9911.32] - - [4096, 3288, 1, 1024] - - [865, 9841.17] + - [939, 9841.17] - - [1024, 3925, 1, 4096] - - [866, 9418.95] + - [940, 9418.95] - - [1024, 3057, 1, 4096] - - [867, 9590.51] + - [941, 9590.51] - - [4096, 3488, 1, 1024] - - [866, 9732.5] + - [940, 9732.5] - - [4096, 3046, 1, 1024] - - [866, 9850.72] + - [940, 9850.72] - - [1024, 3189, 1, 4096] - - [878, 8677.02] + - [952, 8677.02] - - [4096, 3399, 1, 1024] - - [862, 9673.09] + - [936, 9673.09] - - [1024, 3383, 1, 4096] - - [885, 9102.37] + - [959, 9102.37] - - [1024, 3415, 1, 4096] - - [885, 9216.37] + - [959, 9216.37] - - [1024, 3388, 1, 4096] - - [885, 9127.53] + - [959, 9127.53] - - [1024, 3376, 1, 4096] - - [882, 9090.53] + - [956, 9090.53] - - [1024, 3473, 1, 4096] - - [885, 9354.12] + - [959, 9354.12] - - [4096, 3162, 1, 1024] - - [861, 9694.83] + - [935, 9694.83] - - [1024, 3448, 1, 4096] - - [885, 9283.45] + - [959, 9283.45] - - [4096, 3362, 1, 1024] - - [877, 9673.33] + - [951, 9673.33] - - [64, 228, 272, 228] - - [870, 7039.13] + - [944, 7039.13] - - [1024, 3262, 1, 4096] - - [879, 8850.84] + - [953, 8850.84] - - [1024, 3184, 1, 4096] - - [864, 8625.37] + - [938, 8625.37] - - [1024, 3378, 1, 4096] - - [884, 9105.27] + - [958, 9105.27] - - [4096, 3548, 1, 1024] - - [865, 9877.83] + - [939, 9877.83] - - [4096, 2977, 1, 1024] - - [865, 9647.81] + - [939, 9647.81] - - [64, 21, 2976, 21] - - [900, 2364.81] + - [974, 2364.81] - - [64, 112, 576, 111] - - [899, 5973.68] + - [973, 5973.68] - - [4096, 3443, 1, 1024] - - [861, 9784.5] + - [935, 9784.5] - - [1024, 3289, 1, 4096] - - [885, 8874.04] + - [959, 8874.04] - - [1024, 3483, 1, 4096] - - [881, 9380.57] + - [955, 9380.57] - - [4096, 3190, 1, 1024] - - [877, 9850.96] + - [951, 9850.96] - - [1024, 3421, 1, 4096] - - [885, 9214.06] + - [959, 9214.06] - - [1024, 3514, 1, 4096] - - [884, 9458.23] + - [958, 9458.23] - - [1024, 3532, 1, 4096] - - [885, 9513.03] + - [959, 9513.03] - - [1024, 3565, 1, 4096] - - [884, 9630.6] + - [958, 9630.6] - - [4096, 3422, 1, 1024] - - [862, 9733.79] + - [936, 9733.79] - - [4096, 3263, 1, 1024] - - [866, 9776.94] + - [940, 9776.94] - - [4096, 3296, 1, 1024] - - [866, 9860.61] + - [940, 9860.61] - - [4096, 3640, 1, 1024] - - [876, 9782.3] + - [950, 9782.3] - - [4096, 3463, 1, 1024] - - [865, 9672.0] + - [939, 9672.0] - - [4096, 3528, 1, 1024] - - [866, 9829.98] + - [940, 9829.98] - - [1024, 3351, 1, 4096] - - [879, 9054.37] + - [953, 9054.37] - - [1024, 3462, 1, 4096] - - [885, 9327.85] + - [959, 9327.85] - - [4096, 3226, 1, 1024] - - [866, 9674.93] + - [940, 9674.93] - - [4096, 3439, 1, 1024] - - [861, 9823.18] + - [935, 9823.18] - - [4096, 3121, 1, 1024] - - [861, 9672.64] + - [935, 9672.64] - - [1024, 4059, 1, 33708] - - [865, 9885.72] + - [939, 9885.72] - - [1024, 3311, 1, 4096] - - [885, 8910.01] + - [959, 8910.01] - - [1024, 3230, 1, 4096] - - [885, 8705.9] + - [959, 8705.9] - - [4096, 3353, 1, 1024] - - [877, 9671.86] + - [951, 9671.86] - - [4096, 3402, 1, 1024] - - [862, 9727.04] + - [936, 9727.04] - - [1024, 3427, 1, 4096] - - [885, 9233.55] + - [959, 9233.55] - - [1024, 3346, 1, 4096] - - [885, 9015.77] + - [959, 9015.77] - - [1024, 3126, 1, 4096] - - [879, 8519.31] + - [953, 8519.31] - - [1024, 3796, 1, 1024] - - [863, 8916.75] + - [937, 8916.75] - - [1024, 3990, 1, 4096] - - [867, 9600.86] + - [941, 9600.86] - - [1024, 3257, 1, 4096] - - [863, 8790.42] + - [937, 8790.42] - - [4096, 3996, 1, 1024] - - [866, 9788.25] + - [940, 9788.25] - - [64, 143, 432, 143] - - [894, 6087.24] + - [968, 6087.24] - - [1024, 3306, 1, 4096] - - [878, 9035.69] + - [952, 9035.69] - - [1024, 3389, 1, 4096] - - [885, 9134.92] + - [959, 9134.92] - - [1024, 3500, 1, 4096] - - [885, 9443.33] + - [959, 9443.33] - - [1024, 3999, 1, 33708] - - [866, 9741.24] + - [940, 9741.24] - - [4096, 3486, 1, 1024] - - [866, 9719.67] + - [940, 9719.67] - - [1024, 3438, 1, 4096] - - [885, 9259.38] + - [959, 9259.38] - - [4096, 3616, 1, 1024] - - [876, 9739.77] + - [950, 9739.77] - - [1024, 3955, 1, 1024] - - [878, 9260.37] + - [952, 9260.37] - - [4096, 3430, 1, 1024] - - [877, 9819.95] + - [951, 9819.95] - - [4096, 3271, 1, 1024] - - [866, 9802.04] + - [940, 9802.04] - - [1024, 3364, 1, 4096] - - [878, 9144.63] + - [952, 9144.63] - - [64, 54, 1184, 54] - - [898, 4315.78] + - [972, 4315.78] - - [1024, 3497, 1, 4096] - - [885, 9429.42] + - [959, 9429.42] - - [4096, 3503, 1, 1024] - - [865, 9764.48] + - [939, 9764.48] - - [4096, 3344, 1, 1024] - - [862, 9614.16] + - [936, 9614.16] - - [1024, 3457, 1, 4096] - - [885, 9320.6] + - [959, 9320.6] - - [4096, 3466, 1, 1024] - - [865, 9677.81] + - [939, 9677.81] - - [1024, 3976, 1, 33708] - - [866, 9685.38] + - [940, 9685.38] - - [1024, 3395, 1, 4096] - - [884, 9146.39] + - [958, 9146.39] - - [4096, 3361, 1, 1024] - - [876, 9677.89] + - [950, 9677.89] - - [1024, 3751, 1, 33708] - - [874, 9234.69] + - [948, 9234.69] - - [1024, 3822, 1, 1024] - - [863, 8977.83] + - [937, 8977.83] - - [4096, 3315, 1, 1024] - - [866, 9922.54] + - [940, 9922.54] - - [1024, 3163, 1, 4096] - - [878, 8577.79] + - [952, 8577.79] - - [4096, 3547, 1, 1024] - - [866, 9882.92] + - [940, 9882.92] - - [4096, 3340, 1, 1024] - - [876, 9635.42] + - [950, 9635.42] - - [1024, 3296, 1, 4096] - - [885, 8874.66] + - [959, 8874.66] - - [1024, 3468, 1, 4096] - - [885, 9350.26] + - [959, 9350.26] - - [4096, 3294, 1, 1024] - - [865, 9856.87] + - [939, 9856.87] - - [1024, 3406, 1, 4096] - - [881, 9162.84] + - [955, 9162.84] - - [1024, 3860, 1, 33708] - - [865, 9403.56] + - [939, 9403.56] - - [1024, 3584, 1, 4096] - - [882, 9677.44] + - [956, 9677.44] - - [4096, 3189, 1, 1024] - - [877, 9820.69] + - [951, 9820.69] - - [4096, 3494, 1, 1024] - - [865, 9747.68] + - [939, 9747.68] - - [64, 135, 480, 135] - - [891, 5966.34] + - [965, 5966.34] - - [1024, 3093, 1, 4096] - - [879, 8446.06] + - [953, 8446.06] - - [4096, 3421, 1, 1024] - - [862, 9776.03] + - [936, 9776.03] - - [1024, 3479, 1, 4096] - - [885, 9376.54] + - [959, 9376.54] - - [1024, 3433, 1, 4096] - - [885, 9251.14] + - [959, 9251.14] - - [4096, 3311, 1, 1024] - - [865, 9901.53] + - [939, 9901.53] - - [1024, 3381, 1, 4096] - - [885, 9103.99] + - [959, 9103.99] - - [1024, 3996, 1, 4096] - - [866, 9609.56] + - [940, 9609.56] - - [4096, 3384, 1, 1024] - - [876, 9750.01] + - [950, 9750.01] - - [1024, 3247, 1, 4096] - - [864, 8872.59] + - [938, 8872.59] - - [1024, 3169, 1, 4096] - - [863, 8597.61] + - [937, 8597.61] - - [1024, 3088, 1, 4096] - - [879, 8410.07] + - [953, 8410.07] - - [1024, 3363, 1, 4096] - - [885, 9069.5] + - [959, 9069.5] - - [1024, 3538, 1, 4096] - - [884, 9529.68] + - [958, 9529.68] - - [1024, 3996, 1, 1024] - - [868, 9323.06] + - [942, 9323.06] - - [4096, 3169, 1, 1024] - - [862, 9821.4] + - [936, 9821.4] - - [4096, 3538, 1, 1024] - - [865, 9859.42] + - [939, 9859.42] - - [4096, 3401, 1, 1024] - - [862, 9754.5] + - [936, 9754.5] - - [4096, 3581, 1, 1024] - - [865, 9960.71] + - [939, 9960.71] - - [1024, 3180, 1, 4096] - - [863, 8635.05] + - [937, 8635.05] - - [1024, 3870, 1, 1024] - - [864, 9085.69] + - [938, 9085.69] - - [4096, 3555, 1, 1024] - - [865, 9905.74] + - [939, 9905.74] - - [4096, 3412, 1, 1024] - - [877, 9778.56] + - [951, 9778.56] - - [4096, 3302, 1, 1024] - - [865, 9888.71] + - [939, 9888.71] - - [1024, 3561, 1, 4096] - - [881, 9597.05] + - [955, 9597.05] - - [1024, 3302, 1, 4096] - - [885, 8900.87] + - [959, 8900.87] - - [1024, 3976, 1, 4096] - - [867, 9563.22] + - [941, 9563.22] - - [4096, 3485, 1, 1024] - - [865, 9722.57] + - [939, 9722.57] - - [4096, 3534, 1, 1024] - - [865, 9847.22] + - [939, 9847.22] - - [1024, 3110, 1, 4096] - - [878, 8458.56] + - [952, 8458.56] - - [1024, 3401, 1, 4096] - - [885, 9174.81] + - [959, 9174.81] - - [4096, 3216, 1, 1024] - - [865, 9645.49] + - [939, 9645.49] - - [1024, 4020, 1, 33708] - - [865, 9793.61] + - [939, 9793.61] - - [1024, 3215, 1, 4096] - - [885, 8677.51] + - [959, 8677.51] - - [4096, 3566, 1, 1024] - - [865, 9924.78] + - [939, 9924.78] - - [1024, 3137, 1, 4096] - - [863, 8547.07] + - [937, 8547.07] - - [4096, 3359, 1, 1024] - - [862, 9673.73] + - [936, 9673.73] - - [4096, 3392, 1, 1024] - - [877, 9757.51] + - [951, 9757.51] - - [1024, 3506, 1, 4096] - - [885, 9443.0] + - [959, 9443.0] - - [4096, 3233, 1, 1024] - - [865, 9698.7] + - [939, 9698.7] - - [1024, 3444, 1, 4096] - - [885, 9275.54] + - [959, 9275.54] - - [1024, 3975, 1, 4096] - - [866, 9556.87] + - [940, 9556.87] - - [1024, 3870, 1, 33708] - - [865, 9427.44] + - [939, 9427.44] - - [4096, 3465, 1, 1024] - - [866, 9675.01] + - [940, 9675.01] - - [4096, 3968, 1, 1024] - - [862, 9927.93] + - [936, 9927.93] - - [1024, 3523, 1, 4096] - - [885, 9494.15] + - [959, 9494.15] - - [64, 10, 5952, 10] - - [889, 1224.16] + - [963, 1224.16] - - [4096, 3990, 1, 1024] - - [865, 9771.27] + - [939, 9771.27] - - [1024, 3549, 1, 4096] - - [884, 9553.42] + - [958, 9553.42] - - [1024, 3342, 1, 4096] - - [885, 9007.31] + - [959, 9007.31] - - [4096, 3476, 1, 1024] - - [865, 9703.66] + - [939, 9703.66] - - [64, 232, 272, 228] - - [871, 7078.93] + - [945, 7078.93] - - [1024, 3418, 1, 4096] - - [885, 9213.09] + - [959, 9213.09] - - [1024, 3859, 1, 1024] - - [864, 9087.54] + - [938, 9087.54] - - [4096, 3339, 1, 1024] - - [877, 9594.0] + - [951, 9594.0] - - [4096, 3452, 1, 1024] - - [862, 9872.69] + - [936, 9872.69] - - [4096, 3293, 1, 1024] - - [865, 9842.65] + - [939, 9842.65] - - [4096, 3840, 1, 1024] - - [866, 10030.8] + - [940, 10030.8] - - [1024, 3369, 1, 4096] - - [863, 9099.72] + - [937, 9099.72] - - [64, 193, 320, 193] - - [893, 6425.8] + - [967, 6425.8] - - [1024, 3544, 1, 4096] - - [882, 9556.64] + - [956, 9556.64] - - [4096, 3493, 1, 1024] - - [866, 9743.34] + - [940, 9743.34] - - [4096, 3350, 1, 1024] - - [877, 9653.11] + - [951, 9653.11] - - [64, 71, 896, 71] - - [917, 4686.73] + - [991, 4686.73] - - [4096, 3256, 1, 1024] - - [865, 9763.78] + - [939, 9763.78] - - [1024, 3870, 1, 4096] - - [867, 9305.28] + - [941, 9305.28] - - [4096, 4012, 1, 1024] - - [866, 9817.35] + - [940, 9817.35] - - [1024, 3280, 1, 4096] - - [885, 8842.02] + - [959, 8842.02] - - [4096, 3456, 1, 1024] - - [861, 9874.43] + - [935, 9874.43] - - [1024, 3555, 1, 4096] - - [884, 9599.63] + - [958, 9599.63] - - [4096, 3014, 1, 1024] - - [865, 9762.28] + - [939, 9762.28] - - [1024, 3474, 1, 4096] - - [885, 9373.67] + - [959, 9373.67] - - [4096, 3367, 1, 1024] - - [861, 9694.64] + - [935, 9694.64] - - [4096, 3432, 1, 1024] - - [877, 9855.27] + - [951, 9855.27] - - [64, 84, 752, 84] - - [904, 5247.18] + - [978, 5247.18] - - [4096, 3273, 1, 1024] - - [866, 9801.87] + - [940, 9801.87] - - [4096, 3130, 1, 1024] - - [862, 9672.52] + - [936, 9672.52] - - [1024, 2984, 1, 4096] - - [867, 9403.7] + - [941, 9403.7] - - [1024, 3995, 1, 1024] - - [879, 9392.61] + - [953, 9392.61] - - [1024, 3517, 1, 4096] - - [885, 9481.39] + - [959, 9481.39] - - [1024, 3455, 1, 4096] - - [885, 9302.29] + - [959, 9302.29] - - [1024, 3939, 1, 4096] - - [867, 9469.89] + - [941, 9469.89] - - [64, 49, 1296, 49] - - [897, 3938.96] + - [971, 3938.96] - - [64, 14, 4368, 14] - - [889, 1802.47] + - [963, 1802.47] - - [64, 25, 2512, 25] - - [908, 2760.54] + - [982, 2760.54] - - [4096, 3147, 1, 1024] - - [877, 9713.03] + - [951, 9713.03] - - [4096, 3516, 1, 1024] - - [865, 9805.93] + - [939, 9805.93] - - [1024, 3876, 1, 4096] - - [867, 9320.56] + - [941, 9320.56] - - [1024, 3191, 1, 4096] - - [864, 8640.76] + - [938, 8640.76] - - [4096, 3411, 1, 1024] - - [876, 9737.37] + - [950, 9737.37] - - [1024, 3337, 1, 4096] - - [885, 8990.13] + - [959, 8990.13] - - [1024, 3512, 1, 4096] - - [885, 9459.65] + - [959, 9459.65] - - [4096, 3301, 1, 1024] - - [865, 9877.26] + - [939, 9877.26] - - [1024, 3450, 1, 4096] - - [884, 9283.11] + - [958, 9283.11] - - [4096, 3533, 1, 1024] - - [865, 9848.62] + - [939, 9848.62] - - [4096, 3390, 1, 1024] - - [877, 9764.61] + - [951, 9764.61] - - [4096, 3231, 1, 1024] - - [865, 9693.81] + - [939, 9693.81] - - [1024, 2499, 1, 4096] - - [884, 9304.81] + - [958, 9304.81] - - [1024, 3186, 1, 4096] - - [864, 8649.55] + - [938, 8649.55] - - [1024, 3380, 1, 4096] - - [885, 9101.77] + - [959, 9101.77] - - [4096, 3496, 1, 1024] - - [866, 9754.3] + - [940, 9754.3] - - [1024, 3956, 1, 33708] - - [865, 9636.77] + - [939, 9636.77] - - [1024, 3976, 1, 1024] - - [867, 9248.41] + - [941, 9248.41] - - [4096, 2736, 1, 1024] - - [865, 9651.91] + - [939, 9651.91] - - [1024, 3291, 1, 4096] - - [885, 8868.94] + - [959, 8868.94] - - [1024, 3944, 1, 33708] - - [866, 9607.0] + - [940, 9607.0] - - [1024, 3485, 1, 4096] - - [884, 9385.96] + - [958, 9385.96] - - [4096, 3138, 1, 1024] - - [862, 9672.15] + - [936, 9672.15] - - [1024, 3423, 1, 4096] - - [885, 9222.77] + - [959, 9222.77] - - [1024, 3491, 1, 4096] - - [885, 9405.02] + - [959, 9405.02] - - [1024, 3860, 1, 4096] - - [868, 9282.94] + - [942, 9282.94] - - [4096, 3211, 1, 1024] - - [865, 9640.42] + - [939, 9640.42] - - [1024, 3221, 1, 4096] - - [879, 8709.4] + - [953, 8709.4] - - [1024, 2917, 1, 4096] - - [867, 9177.11] + - [941, 9177.11] - - [4096, 3475, 1, 1024] - - [865, 9703.45] + - [939, 9703.45] - - [4096, 3524, 1, 1024] - - [865, 9816.23] + - [939, 9816.23] - - [4096, 2985, 1, 1024] - - [866, 9686.91] + - [940, 9686.91] - - [1024, 3480, 1, 4096] - - [885, 9380.2] + - [959, 9380.2] - - [4096, 3222, 1, 1024] - - [865, 9666.8] + - [939, 9666.8] - - [4096, 3451, 1, 1024] - - [861, 9877.91] + - [935, 9877.91] - - [1024, 3969, 1, 33708] - - [865, 9669.64] + - [939, 9669.64] - - [1024, 3640, 1, 1024] - - [872, 8565.68] + - [946, 8565.68] - - [1024, 3297, 1, 4096] - - [881, 8889.22] + - [955, 8889.22] - - [4096, 3944, 1, 1024] - - [862, 9902.85] + - [936, 9902.85] - - [1024, 3216, 1, 4096] - - [864, 8695.88] + - [938, 8695.88] - - [1024, 3840, 1, 1024] - - [878, 9046.05] + - [952, 9046.05] - - [4096, 3349, 1, 1024] - - [876, 9676.82] + - [950, 9676.82] - - [4096, 3398, 1, 1024] - - [862, 9775.84] + - [936, 9775.84] - - [1024, 3154, 1, 4096] - - [879, 8662.26] + - [953, 8662.26] - - [1024, 3978, 1, 33708] - - [866, 9689.16] + - [940, 9689.16] - - [1024, 3348, 1, 4096] - - [885, 9014.67] + - [959, 9014.67] - - [4096, 3304, 1, 1024] - - [866, 9886.8] + - [940, 9886.8] - - [4096, 4030, 1, 1024] - - [866, 9859.1] + - [940, 9859.1] - - [1024, 4026, 1, 1024] - - [863, 9326.64] + - [937, 9326.64] - - [4096, 3471, 1, 1024] - - [865, 9683.0] + - [939, 9683.0] - - [1024, 3259, 1, 4096] - - [879, 8792.19] + - [953, 8792.19] - - [64, 132, 480, 132] - - [919, 6027.86] + - [993, 6027.86] - - [1024, 3308, 1, 4096] - - [884, 8905.14] + - [958, 8905.14] - - [4096, 3391, 1, 1024] - - [877, 9765.35] + - [951, 9765.35] - - [1024, 3312, 1, 4096] - - [885, 8917.74] + - [959, 8917.74] - - [1024, 3502, 1, 4096] - - [885, 9435.62] + - [959, 9435.62] - - [1024, 3968, 1, 33708] - - [865, 9668.24] + - [939, 9668.24] - - [1024, 3424, 1, 4096] - - [881, 9215.99] + - [955, 9215.99] - - [64, 13, 4672, 13] - - [890, 1662.35] + - [964, 1662.35] - - [4096, 4032, 1, 1024] - - [876, 9877.82] + - [950, 9877.82] - - [1024, 3900, 1, 1024] - - [879, 9116.93] + - [953, 9116.93] - - [4096, 3442, 1, 1024] - - [876, 9773.18] + - [950, 9773.18] - - [1024, 3366, 1, 4096] - - [885, 9079.46] + - [959, 9079.46] - - [4096, 3999, 1, 1024] - - [865, 9786.46] + - [939, 9786.46] - - [1024, 3477, 1, 4096] - - [885, 9364.89] + - [959, 9364.89] - - [1024, 2505, 1, 4096] - - [885, 9304.03] + - [959, 9304.03] - - [4096, 3515, 1, 1024] - - [865, 9797.93] + - [939, 9797.93] - - [1024, 3564, 1, 4096] - - [881, 9632.86] + - [955, 9632.86] - - [4096, 3057, 1, 1024] - - [866, 9880.19] + - [940, 9880.19] - - [1024, 3339, 1, 4096] - - [864, 9029.86] + - [938, 9029.86] - - [4096, 3262, 1, 1024] - - [865, 9780.1] + - [939, 9780.1] - - [1024, 4030, 1, 4096] - - [868, 9682.0] + - [942, 9682.0] - - [1024, 3265, 1, 4096] - - [885, 8797.52] + - [959, 8797.52] - - [1024, 3459, 1, 4096] - - [885, 9313.06] + - [959, 9313.06] - - [4096, 3462, 1, 1024] - - [866, 9669.73] + - [940, 9669.73] - - [64, 85, 752, 85] - - [904, 5186.93] + - [978, 5186.93] - - [1024, 3513, 1, 4096] - - [882, 9469.15] + - [956, 9469.15] - - [1024, 3397, 1, 4096] - - [885, 9151.77] + - [959, 9151.77] - - [4096, 3572, 1, 1024] - - [865, 9945.7] + - [939, 9945.7] - - [4096, 3389, 1, 1024] - - [877, 9740.86] + - [951, 9740.86] - - [4096, 3438, 1, 1024] - - [877, 9822.47] + - [951, 9822.47] - - [64, 102, 624, 100] - - [912, 5487.0] + - [986, 5487.0] - - [1024, 3640, 1, 33708] - - [873, 9083.53] + - [947, 9083.53] - - [1024, 3995, 1, 33708] - - [866, 9731.99] + - [940, 9731.99] - - [1024, 3165, 1, 4096] - - [878, 8601.9] + - [952, 8601.9] - - [4096, 3543, 1, 1024] - - [866, 9868.63] + - [940, 9868.63] - - [4096, 3352, 1, 1024] - - [861, 9668.44] + - [935, 9668.44] - - [1024, 3359, 1, 4096] - - [882, 9050.33] + - [956, 9050.33] - - [1024, 3470, 1, 4096] - - [885, 9355.17] + - [959, 9355.17] - - [64, 15, 4096, 15] - - [889, 1945.43] + - [963, 1945.43] - - [1024, 3392, 1, 4096] - - [884, 9139.71] + - [958, 9139.71] - - [64, 78, 816, 77] - - [896, 4870.56] + - [970, 4870.56] - - [4096, 3137, 1, 1024] - - [861, 9600.22] + - [935, 9600.22] - - [4096, 3506, 1, 1024] - - [866, 9779.08] + - [940, 9779.08] - - [1024, 3095, 1, 4096] - - [878, 8381.24] + - [952, 8381.24] - - [1024, 3859, 1, 4096] - - [865, 9288.63] + - [939, 9288.63] - - [4096, 3369, 1, 1024] - - [877, 9697.73] + - [951, 9697.73] - - [64, 45, 1424, 45] - - [914, 3883.74] + - [988, 3883.74] - - [1024, 3435, 1, 4096] - - [885, 9264.62] + - [959, 9264.62] - - [1024, 3354, 1, 4096] - - [885, 9035.47] + - [959, 9035.47] - - [1024, 3055, 1, 4096] - - [866, 9597.45] + - [940, 9597.45] - - [4096, 3523, 1, 1024] - - [865, 9821.79] + - [939, 9821.79] - - [4096, 3380, 1, 1024] - - [861, 9721.39] + - [935, 9721.39] - - [1024, 3233, 1, 4096] - - [878, 8724.75] + - [952, 8724.75] - - [4096, 3221, 1, 1024] - - [865, 9661.04] + - [939, 9661.04] - - [4096, 3270, 1, 1024] - - [865, 9797.92] + - [939, 9797.92] - - [4096, 3593, 1, 1024] - - [876, 9679.31] + - [950, 9679.31] - - [1024, 3358, 1, 4096] - - [885, 9051.82] + - [959, 9051.82] - - [1024, 3540, 1, 4096] - - [885, 9533.59] + - [959, 9533.59] - - [4096, 3502, 1, 1024] - - [866, 9760.65] + - [940, 9760.65] - - [4096, 2505, 1, 1024] - - [866, 9680.52] + - [940, 9680.52] - - [4096, 3397, 1, 1024] - - [876, 9785.85] + - [950, 9785.85] - - [1024, 3300, 1, 4096] - - [879, 8907.85] + - [953, 8907.85] - - [4096, 3095, 1, 1024] - - [862, 9618.78] + - [936, 9618.78] - - [1024, 3182, 1, 4096] - - [878, 8606.16] + - [952, 8606.16] - - [1024, 3299, 1, 4096] - - [884, 8885.48] + - [958, 8885.48] - - [1024, 3276, 1, 4096] - - [879, 8872.75] + - [953, 8872.75] - - [1024, 3360, 1, 4096] - - [882, 9044.2] + - [956, 9044.2] - - [4096, 3360, 1, 1024] - - [877, 9681.39] + - [951, 9681.39] - - [4096, 2918, 1, 1024] - - [861, 9732.74] + - [935, 9732.74] - - [1024, 3939, 1, 33708] - - [865, 9595.96] + - [939, 9595.96] - - [4096, 3314, 1, 1024] - - [866, 9915.02] + - [940, 9915.02] - - [1024, 3319, 1, 4096] - - [885, 8956.37] + - [959, 8956.37] - - [64, 35, 1808, 35] - - [902, 3060.27] + - [976, 3060.27] - - [1024, 3942, 1, 1024] - - [878, 9211.83] + - [952, 9211.83] - - [1024, 3465, 1, 4096] - - [885, 9340.73] + - [959, 9340.73] - - [4096, 3546, 1, 1024] - - [866, 9875.41] + - [940, 9875.41] - - [1024, 3403, 1, 4096] - - [878, 9224.34] + - [952, 9224.34] - - [1024, 3948, 1, 1024] - - [864, 9245.63] + - [938, 9245.63] - - [4096, 3441, 1, 1024] - - [877, 9758.72] + - [951, 9758.72] - - [1024, 3139, 1, 4096] - - [878, 8582.84] + - [952, 8582.84] - - [1024, 3563, 1, 4096] - - [885, 9620.74] + - [959, 9620.74] - - [1024, 3508, 1, 4096] - - [882, 9449.36] + - [956, 9449.36] - - [1024, 3975, 1, 33708] - - [865, 9683.55] + - [939, 9683.55] - - [1024, 3446, 1, 4096] - - [884, 9289.51] + - [958, 9289.51] - - [1024, 3529, 1, 4096] - - [881, 9491.29] + - [955, 9491.29] - - [64, 112, 576, 112] - - [906, 6387.14] + - [980, 6387.14] - - [4096, 3461, 1, 1024] - - [866, 9663.33] + - [940, 9663.33] - - [1024, 3574, 1, 4096] - - [884, 9662.88] + - [958, 9662.88] - - [1024, 3101, 1, 4096] - - [879, 8468.34] + - [953, 8468.34] - - [1024, 3927, 1, 1024] - - [864, 9207.97] + - [938, 9207.97] - - [4096, 3224, 1, 1024] - - [866, 9665.61] + - [940, 9665.61] - - [4096, 3437, 1, 1024] - - [862, 9857.21] + - [936, 9857.21] - - [4096, 3900, 1, 1024] - - [877, 9826.25] + - [951, 9826.25] - - [1024, 3495, 1, 4096] - - [885, 9412.41] + - [959, 9412.41] - - [1024, 3977, 1, 33708] - - [865, 9687.87] + - [939, 9687.87] - - [1024, 3328, 1, 4096] - - [885, 8975.57] + - [959, 8975.57] - - [4096, 3168, 1, 1024] - - [861, 9754.87] + - [935, 9754.87] - - [1024, 4026, 1, 33708] - - [865, 9807.24] + - [939, 9807.24] - - [1024, 3292, 1, 4096] - - [878, 8901.83] + - [952, 8901.83] - - [1024, 3294, 1, 4096] - - [885, 8877.03] + - [959, 8877.03] - - [4096, 3335, 1, 1024] - - [862, 9616.23] + - [936, 9616.23] - - [4096, 3400, 1, 1024] - - [876, 9710.73] + - [950, 9710.73] - - [1024, 3287, 1, 4096] - - [863, 8908.07] + - [937, 8908.07] - - [1024, 3910, 1, 4096] - - [867, 9401.03] + - [941, 9401.03] - - [1024, 3780, 1, 1024] - - [878, 8863.29] + - [952, 8863.29] - - [4096, 3098, 1, 1024] - - [862, 9606.47] + - [936, 9606.47] - - [1024, 3584, 1, 33708] - - [885, 9775.33] + - [959, 9775.33] - - [64, 29, 2176, 29] - - [907, 3135.03] + - [981, 3135.03] - - [1024, 3371, 1, 4096] - - [863, 9117.81] + - [937, 9117.81] - - [1024, 3546, 1, 4096] - - [885, 9547.3] + - [959, 9547.3] - - [1024, 4012, 1, 1024] - - [867, 9353.73] + - [941, 9353.73] - - [4096, 3505, 1, 1024] - - [865, 9773.17] + - [939, 9773.17] - - [4096, 3554, 1, 1024] - - [865, 9895.59] + - [939, 9895.59] - - [4096, 3063, 1, 1024] - - [865, 9898.98] + - [939, 9898.98] - - [1024, 3900, 1, 33708] - - [866, 9502.93] + - [940, 9502.93] - - [1024, 3345, 1, 4096] - - [885, 9015.85] + - [959, 9015.85] - - [1024, 3357, 1, 4096] - - [885, 9041.23] + - [959, 9041.23] - - [1024, 3282, 1, 4096] - - [878, 8860.17] + - [952, 8860.17] - - [4096, 3484, 1, 1024] - - [866, 9721.33] + - [940, 9721.33] - - [1024, 3557, 1, 4096] - - [882, 9573.48] + - [956, 9573.48] - - [1024, 3476, 1, 4096] - - [885, 9361.72] + - [959, 9361.72] - - [1024, 3751, 1, 1024] - - [879, 8849.11] + - [953, 8849.11] - - [4096, 3379, 1, 1024] - - [862, 9741.49] + - [936, 9741.49] - - [4096, 3428, 1, 1024] - - [861, 9767.82] + - [935, 9767.82] - - [4096, 3126, 1, 1024] - - [876, 9701.9] + - [950, 9701.9] - - [64, 41, 1552, 41] - - [911, 3555.69] + - [985, 3555.69] - - [1024, 3325, 1, 4096] - - [863, 8962.41] + - [937, 8962.41] - - [4096, 3501, 1, 1024] - - [865, 9762.01] + - [939, 9762.01] - - [4096, 3358, 1, 1024] - - [861, 9680.42] + - [935, 9680.42] - - [1024, 3441, 1, 4096] - - [885, 9271.27] + - [959, 9271.27] - - [1024, 3552, 1, 4096] - - [881, 9565.42] + - [955, 9565.42] - - [4096, 3232, 1, 1024] - - [866, 9696.81] + - [940, 9696.81] - - [64, 18, 3440, 18] - - [886, 2059.33] + - [960, 2059.33] - - [1024, 3412, 1, 4096] - - [885, 9199.28] + - [959, 9199.28] - - [1024, 3372, 1, 4096] - - [882, 9083.49] + - [956, 9083.49] - - [1024, 3585, 1, 4096] - - [872, 8710.29] + - [946, 8710.29] - - [4096, 3143, 1, 1024] - - [877, 9692.12] + - [951, 9692.12] - - [4096, 3464, 1, 1024] - - [865, 9661.93] + - [939, 9661.93] - - [1024, 3145, 1, 4096] - - [864, 8526.33] + - [938, 8526.33] - - [4096, 3375, 1, 1024] - - [876, 9734.78] + - [950, 9734.78] - - [4096, 2917, 1, 1024] - - [861, 9714.57] + - [935, 9714.57] - - [4096, 3978, 1, 1024] - - [866, 9741.43] + - [940, 9741.43] - - [1024, 2765, 1, 4096] - - [867, 8706.75] + - [941, 8706.75] - - [64, 148, 432, 148] - - [892, 6372.17] + - [966, 6372.17] - - [1024, 3452, 1, 4096] - - [884, 9301.38] + - [958, 9301.38] - - [4096, 3584, 1, 1024] - - [866, 10005.7] + - [940, 10005.7] - - [4096, 3545, 1, 1024] - - [866, 9877.87] + - [940, 9877.87] - - [1024, 3352, 1, 4096] - - [885, 9035.19] + - [959, 9035.19] - - [64, 159, 400, 160] - - [894, 6952.11] + - [968, 6952.11] - - [4096, 3292, 1, 1024] - - [865, 9856.51] + - [939, 9856.51] - - [1024, 3525, 1, 4096] - - [885, 9501.5] + - [959, 9501.5] - - [1024, 3266, 1, 4096] - - [885, 8817.43] + - [959, 8817.43] - - [1024, 3382, 1, 4096] - - [884, 9101.54] + - [958, 9101.54] - - [4096, 3492, 1, 1024] - - [865, 9747.29] + - [939, 9747.29] - - [4096, 3419, 1, 1024] - - [877, 9745.88] + - [951, 9745.88] - - [1024, 3796, 1, 33708] - - [874, 9356.26] + - [948, 9356.26] - - [1024, 3293, 1, 4096] - - [881, 8868.4] + - [955, 8868.4] - - [4096, 3796, 1, 1024] - - [866, 9885.36] + - [940, 9885.36] - - [1024, 3487, 1, 4096] - - [882, 9391.34] + - [956, 9391.34] - - [4096, 3166, 1, 1024] - - [877, 9718.46] + - [951, 9718.46] - - [64, 102, 624, 101] - - [906, 5547.84] + - [980, 5547.84] - - [1024, 3409, 1, 4096] - - [885, 9187.88] + - [959, 9187.88] - - [1024, 3520, 1, 4096] - - [884, 9485.09] + - [958, 9485.09] - - [1024, 3573, 1, 4096] - - [885, 9652.71] + - [959, 9652.71] - - [4096, 3366, 1, 1024] - - [861, 9684.31] + - [935, 9684.31] - - [4096, 3720, 1, 1024] - - [877, 9703.34] + - [951, 9703.34] - - [4096, 3207, 1, 1024] - - [865, 9626.21] + - [939, 9626.21] - - [4096, 3272, 1, 1024] - - [865, 9795.51] + - [939, 9795.51] - - [1024, 3390, 1, 4096] - - [885, 9125.88] + - [959, 9125.88] - - [4096, 3183, 1, 1024] - - [877, 9825.87] + - [951, 9825.87] - - [4096, 3536, 1, 1024] - - [866, 9846.51] + - [940, 9846.51] - - [4096, 3563, 1, 1024] - - [866, 9913.8] + - [940, 9913.8] - - [1024, 3482, 1, 4096] - - [885, 9376.91] + - [959, 9376.91] - - [4096, 3447, 1, 1024] - - [876, 9875.09] + - [950, 9875.09] - - [4096, 3955, 1, 1024] - - [861, 9922.39] + - [935, 9922.39] - - [4096, 4005, 1, 1024] - - [866, 9803.43] + - [940, 9803.43] - - [1024, 3493, 1, 4096] - - [885, 9411.37] + - [959, 9411.37] - - [4096, 3410, 1, 1024] - - [861, 9788.34] + - [935, 9788.34] - - [1024, 3422, 1, 4096] - - [884, 9216.28] + - [958, 9216.28] - - [1024, 3350, 1, 4096] - - [879, 9068.02] + - [953, 9068.02] - - [4096, 3300, 1, 1024] - - [866, 9883.29] + - [940, 9883.29] - - [4096, 3910, 1, 1024] - - [876, 9800.12] + - [950, 9800.12] - - [1024, 3489, 1, 4096] - - [885, 9398.66] + - [959, 9398.66] - - [4096, 3483, 1, 1024] - - [865, 9715.96] + - [939, 9715.96] - - [4096, 3532, 1, 1024] - - [866, 9837.99] + - [940, 9837.99] - - [64, 101, 624, 101] - - [906, 5452.28] + - [980, 5452.28] - - [4096, 3230, 1, 1024] - - [866, 9683.6] + - [940, 9683.6] - - [4096, 3427, 1, 1024] - - [861, 9760.72] + - [935, 9760.72] - - [1024, 3377, 1, 4096] - - [885, 9101.17] + - [959, 9101.17] - - [1024, 3488, 1, 4096] - - [884, 9381.99] + - [958, 9381.99] - - [1024, 3616, 1, 4096] - - [867, 8709.33] + - [941, 8709.33] - - [1024, 3426, 1, 4096] - - [885, 9229.43] + - [959, 9229.43] - - [4096, 3357, 1, 1024] - - [877, 9668.5] + - [951, 9668.5] - - [4096, 3406, 1, 1024] - - [862, 9748.57] + - [936, 9748.57] - - [1024, 3046, 1, 4096] - - [867, 9590.43] + - [941, 9590.43] - - [1024, 3272, 1, 4096] - - [878, 8930.2] + - [952, 8930.2] - - [1024, 3256, 1, 4096] - - [863, 8828.16] + - [937, 8828.16] - - [4096, 3247, 1, 1024] - - [865, 9741.81] + - [939, 9741.81] - - [4096, 3088, 1, 1024] - - [877, 9589.07] + - [951, 9589.07] - - [1024, 3531, 1, 4096] - - [884, 9501.06] + - [958, 9501.06] - - [64, 160, 400, 160] - - [920, 7334.03] + - [994, 7334.03] - - [4096, 3511, 1, 1024] - - [866, 9789.38] + - [940, 9789.38] - - [1024, 3720, 1, 33708] - - [875, 9214.68] + - [949, 9214.68] - - [1024, 3267, 1, 4096] - - [878, 8831.04] + - [952, 8831.04] - - [1024, 3270, 1, 4096] - - [879, 8876.68] + - [953, 8876.68] - - [1024, 3461, 1, 4096] - - [884, 9327.55] + - [958, 9327.55] - - [4096, 3474, 1, 1024] - - [865, 9697.04] + - [939, 9697.04] - - [4096, 2984, 1, 1024] - - [866, 9674.08] + - [940, 9674.08] - - [1024, 3399, 1, 4096] - - [884, 9158.58] + - [958, 9158.58] - - [4096, 3574, 1, 1024] - - [865, 9942.3] + - [939, 9942.3] - - [1024, 3876, 1, 1024] - - [879, 9085.13] + - [953, 9085.13] - - [4096, 3337, 1, 1024] - - [862, 9611.43] + - [936, 9611.43] - - [4096, 3450, 1, 1024] - - [877, 9930.35] + - [951, 9930.35] - - [1024, 3720, 1, 1024] - - [863, 8755.49] + - [937, 8755.49] - - [1024, 4059, 1, 1024] - - [868, 9366.67] + - [942, 9366.67] - - [4096, 3291, 1, 1024] - - [865, 9856.33] + - [939, 9856.33] - - [64, 93, 688, 93] - - [909, 5497.11] + - [983, 5497.11] - - [4096, 3995, 1, 1024] - - [865, 9776.67] + - [939, 9776.67] - - [64, 147, 432, 147] - - [895, 6233.88] + - [969, 6233.88] - - [4096, 3491, 1, 1024] - - [865, 9742.94] + - [939, 9742.94] - - [4096, 3348, 1, 1024] - - [877, 9634.11] + - [951, 9634.11] - - [4096, 3925, 1, 1024] - - [876, 9848.54] + - [950, 9848.54] - - [4096, 3894, 1, 1024] - - [876, 9812.55] + - [950, 9812.55] - - [1024, 3456, 1, 4096] - - [885, 9317.91] + - [959, 9317.91] - - [1024, 3394, 1, 4096] - - [884, 9148.86] + - [958, 9148.86] - - [64, 100, 624, 102] - - [906, 5416.95] + - [980, 5416.95] - - [4096, 3165, 1, 1024] - - [876, 9743.35] + - [950, 9743.35] - - [4096, 3470, 1, 1024] - - [866, 9691.04] + - [940, 9691.04] - - [1024, 3014, 1, 4096] - - [867, 9486.26] + - [941, 9486.26] - - [1024, 3375, 1, 4096] - - [885, 9082.71] + - [959, 9082.71] - - [4096, 3859, 1, 1024] - - [876, 9738.87] + - [950, 9738.87] - - [4096, 3365, 1, 1024] - - [877, 9694.74] + - [951, 9694.74] - - [1024, 3162, 1, 4096] - - [878, 8550.31] + - [952, 8550.31] - - [1024, 3840, 1, 33708] - - [875, 9409.08] + - [949, 9409.08] - - [1024, 3437, 1, 4096] - - [885, 9270.49] + - [959, 9270.49] - - [4096, 3319, 1, 1024] - - [866, 9927.15] + - [940, 9927.15] - - [1024, 3320, 1, 4096] - - [885, 8962.29] + - [959, 8962.29] - - [64, 23, 2720, 23] - - [908, 2569.53] + - [982, 2569.53] - - [4096, 3328, 1, 1024] - - [865, 9997.41] + - [939, 9997.41] - - [1024, 3235, 1, 4096] - - [885, 8724.31] + - [959, 8724.31] - - [4096, 3282, 1, 1024] - - [866, 9827.13] + - [940, 9827.13] - - [1024, 3367, 1, 4096] - - [878, 9084.02] + - [952, 9084.02] - - [1024, 3542, 1, 4096] - - [885, 9533.1] + - [959, 9533.1] - - [64, 177, 352, 177] - - [871, 6817.91] + - [945, 6817.91] - - [4096, 3145, 1, 1024] - - [862, 9710.28] + - [936, 9710.28] - - [4096, 3514, 1, 1024] - - [865, 9793.06] + - [939, 9793.06] - - [1024, 3432, 1, 4096] - - [885, 9249.39] + - [959, 9249.39] - - [4096, 3409, 1, 1024] - - [861, 9721.6] + - [935, 9721.6] - - [1024, 4012, 1, 33708] - - [865, 9773.35] + - [939, 9773.35] - - [4096, 3876, 1, 1024] - - [862, 9745.65] + - [936, 9745.65] - - [4096, 3299, 1, 1024] - - [865, 9873.53] + - [939, 9873.53] - - [1024, 3168, 1, 4096] - - [878, 8597.13] + - [952, 8597.13] - - [4096, 3681, 1, 1024] - - [877, 9840.03] + - [951, 9840.03] - - [4096, 3531, 1, 1024] - - [866, 9847.76] + - [940, 9847.76] - - [4096, 3388, 1, 1024] - - [877, 9772.28] + - [951, 9772.28] - - [1024, 3720, 1, 4096] - - [866, 8951.6] + - [940, 8951.6] - - [1024, 3332, 1, 4096] - - [885, 8978.97] + - [959, 8978.97] - - [1024, 3273, 1, 4096] - - [879, 8982.49] + - [953, 8982.49] - - [1024, 2935, 1, 4096] - - [868, 9224.89] + - [942, 9224.89] - - [1024, 3467, 1, 4096] - - [882, 9329.33] + - [956, 9329.33] - - [4096, 3542, 1, 1024] - - [865, 9858.51] + - [939, 9858.51] - - [1024, 3130, 1, 4096] - - [864, 8526.66] + - [938, 8526.66] - - [1024, 3405, 1, 4096] - - [885, 9163.44] + - [959, 9163.44] - - [1024, 3960, 1, 1024] - - [863, 9280.36] + - [937, 9280.36] - - [4096, 3405, 1, 1024] - - [876, 9710.2] + - [950, 9710.2] - - [512, 512, 1, 1024] - - [1062, 6670.96] + - [1136, 6670.96] - - [8, 500, 1, 512] - - [958, 228.671] + - [1032, 228.671] - - [512, 512, 1, 2000] - - [1095, 7629.44] + - [1169, 7629.44] - - [32, 512, 1, 512] - - [955, 904.045] + - [1029, 904.045] - - [100, 1024, 1, 2048] - - [1017, 3196.98] + - [1091, 3196.98] - - [8, 512, 1, 500] - - [948, 237.137] + - [1022, 237.137] - - [8, 500, 1, 1024] - - [1012, 289.366] + - [1086, 289.366] - - [100, 2000, 1, 1024] - - [1051, 3368.52] + - [1125, 3368.52] - - [64, 1024, 1, 100] - - [950, 941.709] + - [1024, 941.709] - - [64, 1024, 1, 500] - - [1077, 2659.84] + - [1151, 2659.84] - - [64, 1024, 1, 1024] - - [1015, 2452.91] + - [1089, 2452.91] - - [128, 2000, 1, 100] - - [1071, 2560.1] + - [1145, 2560.1] - - [2, 500, 1, 2048] - - [1012, 72.2127] + - [1086, 72.2127] - - [16, 512, 1, 10] - - [926, 18.3857] + - [1000, 18.3857] - - [64, 2000, 1, 1024] - - [1082, 2800.78] + - [1156, 2800.78] - - [100, 1024, 1, 1024] - - [1010, 3034.17] + - [1084, 3034.17] - - [8, 512, 1, 10] - - [988, 9.24286] + - [1062, 9.24286] - - [16, 500, 1, 2048] - - [1012, 565.846] + - [1086, 565.846] - - [10, 100, 1, 500] - - [948, 58.5112] + - [1022, 58.5112] - - [16, 100, 1, 10] - - [988, 3.67143] + - [1062, 3.67143] - - [500, 1024, 1, 512] - - [1078, 6514.61] + - [1152, 6514.61] - - [128, 1024, 1, 512] - - [1096, 4194.4] + - [1170, 4194.4] - - [512, 500, 1, 2000] - - [1054, 7347.98] + - [1128, 7347.98] - - [2, 100, 1, 2000] - - [948, 20.9333] + - [1022, 20.9333] - - [500, 512, 1, 100] - - [1070, 2539.78] + - [1144, 2539.78] - - [100, 1024, 1, 500] - - [1096, 3216.18] + - [1170, 3216.18] - - [256, 100, 1, 2048] - - [1106, 1689.17] + - [1180, 1689.17] - - [2, 512, 1, 512] - - [962, 50.5123] + - [1036, 50.5123] - - [128, 2000, 1, 512] - - [1082, 4641.46] + - [1156, 4641.46] - - [2, 100, 1, 10] - - [926, 0.496825] + - [1000, 0.496825] - - [16, 2000, 1, 2048] - - [970, 1266.25] + - [1044, 1266.25] - - [200, 100, 1, 100] - - [1116, 316.556] + - [1190, 316.556] - - [256, 1024, 1, 100] - - [1072, 2686.0] + - [1146, 2686.0] - - [200, 500, 1, 1024] - - [1121, 3282.15] + - [1195, 3282.15] - - [500, 100, 1, 100] - - [1035, 631.413] + - [1109, 631.413] - - [4, 100, 1, 10] - - [933, 0.977193] + - [1007, 0.977193] - - [32, 100, 1, 512] - - [1012, 198.935] + - [1086, 198.935] - - [100, 2000, 1, 512] - - [1082, 3832.44] + - [1156, 3832.44] - - [16, 1024, 1, 512] - - [996, 794.476] + - [1070, 794.476] - - [200, 512, 1, 100] - - [1114, 1306.22] + - [1188, 1306.22] - - [4, 1024, 1, 1024] - - [955, 213.225] + - [1029, 213.225] - - [512, 1024, 1, 512] - - [1079, 7049.35] + - [1153, 7049.35] - - [4, 512, 1, 10] - - [987, 4.59123] + - [1061, 4.59123] - - [2, 2048, 1, 2000] - - [948, 300.393] + - [1022, 300.393] - - [64, 2048, 1, 10] - - [1108, 241.041] + - [1182, 241.041] - - [128, 100, 1, 10] - - [1113, 27.6862] + - [1187, 27.6862] - - [4, 512, 1, 2048] - - [948, 146.549] + - [1022, 146.549] - - [64, 2048, 1, 500] - - [1088, 4015.79] + - [1162, 4015.79] - - [512, 512, 1, 512] - - [1043, 6123.17] + - [1117, 6123.17] - - [500, 500, 1, 2000] - - [1054, 7126.67] + - [1128, 7126.67] - - [10, 1024, 1, 2000] - - [1021, 807.671] + - [1095, 807.671] - - [256, 100, 1, 100] - - [1033, 296.396] + - [1107, 296.396] - - [32, 2000, 1, 2048] - - [976, 2167.3] + - [1050, 2167.3] - - [64, 1024, 1, 2048] - - [1009, 2383.23] + - [1083, 2383.23] - - [200, 2048, 1, 512] - - [1084, 5264.04] + - [1158, 5264.04] - - [256, 500, 1, 10] - - [1066, 210.626] + - [1140, 210.626] - - [16, 1024, 1, 100] - - [946, 262.664] + - [1020, 262.664] - - [32, 1024, 1, 1024] - - [951, 1476.97] + - [1025, 1476.97] - - [512, 500, 1, 512] - - [1040, 5851.53] + - [1114, 5851.53] - - [128, 1024, 1, 2000] - - [1124, 5516.6] + - [1198, 5516.6] - - [8, 100, 1, 500] - - [948, 46.3963] + - [1022, 46.3963] - - [100, 2000, 1, 2048] - - [1103, 3715.63] + - [1177, 3715.63] - - [10, 512, 1, 512] - - [958, 292.671] + - [1032, 292.671] - - [8, 500, 1, 10] - - [987, 8.87193] + - [1061, 8.87193] - - [10, 2000, 1, 1024] - - [1001, 640.1] + - [1075, 640.1] - - [16, 1024, 1, 10] - - [986, 36.6714] + - [1060, 36.6714] - - [16, 512, 1, 2048] - - [965, 585.897] + - [1039, 585.897] - - [256, 512, 1, 10] - - [1031, 230.861] + - [1105, 230.861] - - [2, 2000, 1, 100] - - [993, 64.2026] + - [1067, 64.2026] - - [128, 512, 1, 2048] - - [960, 3106.99] + - [1034, 3106.99] - - [128, 512, 1, 100] - - [953, 952.658] + - [1027, 952.658] - - [512, 2000, 1, 1024] - - [1050, 8066.07] + - [1124, 8066.07] - - [64, 500, 1, 2048] - - [1119, 1857.7] + - [1193, 1857.7] - - [64, 2000, 1, 2048] - - [1101, 3442.12] + - [1175, 3442.12] - - [64, 2048, 1, 512] - - [1102, 3315.76] + - [1176, 3315.76] - - [10, 2000, 1, 512] - - [948, 785.376] + - [1022, 785.376] - - [32, 2000, 1, 500] - - [951, 2500.1] + - [1025, 2500.1] - - [64, 2000, 1, 10] - - [939, 231.984] + - [1013, 231.984] - - [500, 100, 1, 10] - - [1036, 88.1282] + - [1110, 88.1282] - - [128, 1024, 1, 500] - - [1087, 4096.1] + - [1161, 4096.1] - - [64, 100, 1, 2048] - - [948, 587.34] + - [1022, 587.34] - - [64, 100, 1, 10] - - [1107, 12.0403] + - [1181, 12.0403] - - [16, 512, 1, 500] - - [958, 461.361] + - [1032, 461.361] - - [32, 2000, 1, 1024] - - [945, 1713.91] + - [1019, 1713.91] - - [200, 512, 1, 1024] - - [1124, 3244.46] + - [1198, 3244.46] - - [128, 2048, 1, 10] - - [940, 455.211] + - [1014, 455.211] - - [200, 100, 1, 2000] - - [948, 1462.09] + - [1022, 1462.09] - - [2, 100, 1, 512] - - [948, 12.5272] + - [1022, 12.5272] - - [64, 2048, 1, 100] - - [1114, 1689.17] + - [1188, 1689.17] - - [32, 512, 1, 100] - - [947, 266.074] + - [1021, 266.074] - - [16, 512, 1, 1024] - - [1012, 569.978] + - [1086, 569.978] - - [4, 1024, 1, 512] - - [1002, 208.151] + - [1076, 208.151] - - [64, 2000, 1, 100] - - [1114, 1649.58] + - [1188, 1649.58] - - [512, 2048, 1, 512] - - [1050, 7849.09] + - [1124, 7849.09] - - [2, 500, 1, 500] - - [936, 53.5188] + - [1010, 53.5188] - - [32, 100, 1, 100] - - [947, 57.2429] + - [1021, 57.2429] - - [100, 500, 1, 2000] - - [951, 2784.06] + - [1025, 2784.06] - - [200, 2000, 1, 100] - - [1023, 2994.11] + - [1097, 2994.11] - - [10, 512, 1, 10] - - [983, 11.1345] + - [1057, 11.1345] - - [100, 500, 1, 2048] - - [1123, 2361.72] + - [1197, 2361.72] - - [4, 2048, 1, 500] - - [958, 379.359] + - [1032, 379.359] - - [200, 500, 1, 100] - - [1084, 1288.76] + - [1158, 1288.76] - - [500, 500, 1, 500] - - [1040, 5425.45] + - [1114, 5425.45] - - [2, 100, 1, 1024] - - [1012, 16.3025] + - [1086, 16.3025] - - [128, 2048, 1, 512] - - [1098, 4699.6] + - [1172, 4699.6] - - [200, 2000, 1, 1024] - - [1048, 4621.04] + - [1122, 4621.04] - - [32, 512, 1, 1024] - - [1011, 1028.12] + - [1085, 1028.12] - - [100, 2048, 1, 500] - - [1072, 4142.49] + - [1146, 4142.49] - - [256, 100, 1, 1024] - - [1102, 1443.62] + - [1176, 1443.62] - - [16, 2000, 1, 500] - - [997, 1428.67] + - [1071, 1428.67] - - [128, 100, 1, 100] - - [947, 213.433] + - [1021, 213.433] - - [500, 500, 1, 2048] - - [1044, 6639.1] + - [1118, 6639.1] - - [32, 512, 1, 10] - - [980, 36.0298] + - [1054, 36.0298] - - [128, 100, 1, 1024] - - [1008, 791.598] + - [1082, 791.598] - - [16, 500, 1, 2000] - - [1021, 694.544] + - [1095, 694.544] - - [4, 2048, 1, 100] - - [992, 129.72] + - [1066, 129.72] - - [64, 500, 1, 500] - - [934, 1333.43] + - [1008, 1333.43] - - [500, 1024, 1, 2048] - - [1053, 7031.86] + - [1127, 7031.86] - - [512, 2048, 1, 100] - - [1028, 5285.26] + - [1102, 5285.26] - - [128, 512, 1, 1024] - - [1120, 2519.2] + - [1194, 2519.2] - - [128, 512, 1, 2000] - - [1118, 3608.91] + - [1192, 3608.91] - - [128, 2000, 1, 2000] - - [1091, 7017.64] + - [1165, 7017.64] - - [2, 512, 1, 10] - - [984, 2.13175] + - [1058, 2.13175] - - [10, 512, 1, 500] - - [948, 293.678] + - [1022, 293.678] - - [4, 1024, 1, 2000] - - [968, 326.215] + - [1042, 326.215] - - [256, 100, 1, 2000] - - [1105, 1768.06] + - [1179, 1768.06] - - [512, 2048, 1, 2000] - - [1050, 8674.62] + - [1124, 8674.62] - - [100, 100, 1, 10] - - [1112, 21.6517] + - [1186, 21.6517] - - [256, 500, 1, 1024] - - [1052, 4833.14] + - [1126, 4833.14] - - [128, 512, 1, 10] - - [940, 132.229] + - [1014, 132.229] - - [256, 100, 1, 500] - - [1099, 914.386] + - [1173, 914.386] - - [64, 100, 1, 512] - - [1006, 369.109] + - [1080, 369.109] - - [64, 512, 1, 500] - - [948, 1600.1] + - [1022, 1600.1] - - [64, 2048, 1, 2000] - - [1102, 5925.6] + - [1176, 5925.6] - - [100, 2048, 1, 1024] - - [1060, 3260.6] + - [1134, 3260.6] - - [200, 2000, 1, 10] - - [940, 595.338] + - [1014, 595.338] - - [128, 1024, 1, 100] - - [1084, 1689.17] + - [1158, 1689.17] - - [16, 2000, 1, 100] - - [947, 493.927] + - [1021, 493.927] - - [8, 100, 1, 512] - - [948, 49.8087] + - [1022, 49.8087] - - [500, 2048, 1, 1024] - - [1050, 7651.71] + - [1124, 7651.71] - - [500, 2000, 1, 10] - - [1038, 1008.16] + - [1112, 1008.16] - - [32, 100, 1, 500] - - [1012, 187.016] + - [1086, 187.016] - - [256, 1024, 1, 2048] - - [1053, 6190.95] + - [1127, 6190.95] - - [32, 500, 1, 2048] - - [948, 1083.7] + - [1022, 1083.7] - - [4, 2000, 1, 10] - - [991, 17.6439] + - [1065, 17.6439] - - [128, 500, 1, 2000] - - [1008, 3516.58] + - [1082, 3516.58] - - [8, 1024, 1, 10] - - [982, 18.0649] + - [1056, 18.0649] - - [2, 500, 1, 100] - - [927, 16.1256] + - [1001, 16.1256] - - [10, 500, 1, 512] - - [948, 291.009] + - [1022, 291.009] - - [10, 2000, 1, 10] - - [926, 38.5615] + - [1000, 38.5615] - - [500, 512, 1, 512] - - [1043, 5893.63] + - [1117, 5893.63] - - [32, 500, 1, 500] - - [948, 892.957] + - [1022, 892.957] - - [256, 500, 1, 2000] - - [1057, 6237.92] + - [1131, 6237.92] - - [100, 500, 1, 100] - - [959, 726.844] + - [1033, 726.844] - - [500, 2048, 1, 100] - - [1032, 4867.02] + - [1106, 4867.02] - - [10, 1024, 1, 512] - - [948, 520.227] + - [1022, 520.227] - - [2, 2048, 1, 512] - - [958, 151.628] + - [1032, 151.628] - - [256, 512, 1, 100] - - [1037, 1590.78] + - [1111, 1590.78] - - [10, 2048, 1, 100] - - [948, 324.151] + - [1022, 324.151] - - [8, 2048, 1, 100] - - [1003, 256.1] + - [1077, 256.1] - - [512, 100, 1, 512] - - [1099, 2100.61] + - [1173, 2100.61] - - [4, 500, 1, 500] - - [948, 115.841] + - [1022, 115.841] - - [64, 100, 1, 1024] - - [948, 450.21] + - [1022, 450.21] - - [2, 2048, 1, 1024] - - [1005, 137.708] + - [1079, 137.708] - - [2, 500, 1, 2000] - - [974, 90.3527] + - [1048, 90.3527] - - [512, 1024, 1, 500] - - [1079, 6898.63] + - [1153, 6898.63] - - [128, 2000, 1, 500] - - [1084, 5161.39] + - [1158, 5161.39] - - [32, 512, 1, 2048] - - [1018, 1103.86] + - [1092, 1103.86] - - [10, 100, 1, 2000] - - [948, 106.032] + - [1022, 106.032] - - [4, 100, 1, 512] - - [948, 24.7154] + - [1022, 24.7154] - - [2, 512, 1, 2048] - - [1012, 73.3246] + - [1086, 73.3246] - - [200, 512, 1, 2048] - - [1124, 3954.01] + - [1198, 3954.01] - - [200, 2000, 1, 2000] - - [1086, 6230.63] + - [1160, 6230.63] - - [100, 100, 1, 2000] - - [948, 827.915] + - [1022, 827.915] - - [500, 2048, 1, 2000] - - [1049, 8388.04] + - [1123, 8388.04] - - [64, 2048, 1, 2048] - - [1094, 3406.64] + - [1168, 3406.64] - - [16, 2000, 1, 1024] - - [954, 1024.1] + - [1028, 1024.1] - - [512, 2048, 1, 1024] - - [1027, 8061.22] + - [1101, 8061.22] - - [10, 500, 1, 500] - - [958, 284.191] + - [1032, 284.191] - - [200, 1024, 1, 2048] - - [1122, 4886.29] + - [1196, 4886.29] - - [10, 2000, 1, 2000] - - [948, 1449.38] + - [1022, 1449.38] - - [8, 2000, 1, 500] - - [997, 719.524] + - [1071, 719.524] - - [2, 100, 1, 2048] - - [1012, 19.945] + - [1086, 19.945] - - [32, 100, 1, 2048] - - [1012, 323.894] + - [1086, 323.894] - - [512, 512, 1, 10] - - [1069, 420.203] + - [1143, 420.203] - - [512, 500, 1, 10] - - [1074, 376.571] + - [1148, 376.571] - - [16, 100, 1, 1024] - - [958, 129.72] + - [1032, 129.72] - - [2, 500, 1, 10] - - [922, 2.21864] + - [996, 2.21864] - - [200, 512, 1, 10] - - [924, 188.335] + - [998, 188.335] - - [512, 1024, 1, 100] - - [1024, 3877.97] + - [1098, 3877.97] - - [16, 2000, 1, 2000] - - [948, 2222.32] + - [1022, 2222.32] - - [500, 500, 1, 1024] - - [1044, 6130.37] + - [1118, 6130.37] - - [500, 100, 1, 2048] - - [1099, 2949.41] + - [1173, 2949.41] - - [256, 1024, 1, 512] - - [1063, 5886.84] + - [1137, 5886.84] - - [256, 500, 1, 512] - - [1041, 4380.85] + - [1115, 4380.85] - - [16, 1024, 1, 2000] - - [1012, 1208.36] + - [1086, 1208.36] - - [200, 500, 1, 2048] - - [1124, 3855.52] + - [1198, 3855.52] - - [256, 2000, 1, 10] - - [1026, 727.373] + - [1100, 727.373] - - [10, 2048, 1, 2048] - - [979, 823.158] + - [1053, 823.158] - - [512, 2000, 1, 100] - - [1028, 5120.1] + - [1102, 5120.1] - - [10, 1024, 1, 1024] - - [955, 553.146] + - [1029, 553.146] - - [512, 2000, 1, 2048] - - [1056, 7563.4] + - [1130, 7563.4] - - [500, 1024, 1, 500] - - [1080, 6570.94] + - [1154, 6570.94] - - [500, 100, 1, 512] - - [1099, 2038.32] + - [1173, 2038.32] - - [256, 2000, 1, 100] - - [1048, 3764.81] + - [1122, 3764.81] - - [512, 1024, 1, 2048] - - [1092, 7286.62] + - [1166, 7286.62] - - [32, 512, 1, 500] - - [948, 898.346] + - [1022, 898.346] - - [100, 2000, 1, 10] - - [940, 333.433] + - [1014, 333.433] - - [100, 500, 1, 512] - - [1118, 2176.97] + - [1192, 2176.97] - - [8, 2000, 1, 512] - - [997, 602.453] + - [1071, 602.453] - - [100, 2048, 1, 2048] - - [1104, 3694.87] + - [1178, 3694.87] - - [128, 1024, 1, 2048] - - [1123, 4168.35] + - [1197, 4168.35] - - [8, 500, 1, 2000] - - [1022, 352.213] + - [1096, 352.213] - - [100, 2000, 1, 500] - - [1072, 4045.41] + - [1146, 4045.41] - - [100, 2048, 1, 100] - - [1072, 2081.4] + - [1146, 2081.4] - - [4, 100, 1, 1024] - - [948, 33.1323] + - [1022, 33.1323] - - [500, 2048, 1, 2048] - - [1056, 7765.03] + - [1130, 7765.03] - - [2, 2000, 1, 2048] - - [967, 166.334] + - [1041, 166.334] - - [200, 2048, 1, 10] - - [941, 609.624] + - [1015, 609.624] - - [2, 500, 1, 1024] - - [1012, 75.3941] + - [1086, 75.3941] - - [100, 500, 1, 1024] - - [1008, 1975.41] + - [1082, 1975.41] - - [16, 2048, 1, 500] - - [948, 1473.48] + - [1022, 1473.48] - - [100, 1024, 1, 10] - - [1108, 185.607] + - [1182, 185.607] - - [8, 2048, 1, 1024] - - [1004, 543.404] + - [1078, 543.404] - - [2, 2000, 1, 500] - - [948, 179.956] + - [1022, 179.956] - - [32, 100, 1, 1024] - - [948, 267.812] + - [1022, 267.812] - - [500, 2000, 1, 512] - - [1078, 7087.59] + - [1152, 7087.59] - - [64, 100, 1, 2000] - - [958, 615.485] + - [1032, 615.485] - - [100, 1024, 1, 2000] - - [1121, 4224.52] + - [1195, 4224.52] - - [64, 500, 1, 10] - - [923, 63.5921] + - [997, 63.5921] - - [32, 2048, 1, 100] - - [944, 941.709] + - [1018, 941.709] - - [64, 500, 1, 512] - - [948, 1575.48] + - [1022, 1575.48] - - [10, 100, 1, 1024] - - [958, 82.6806] + - [1032, 82.6806] - - [16, 512, 1, 100] - - [947, 148.506] + - [1021, 148.506] - - [4, 100, 1, 2000] - - [1021, 43.9597] + - [1095, 43.9597] - - [2, 512, 1, 1024] - - [1012, 74.152] + - [1086, 74.152] - - [64, 512, 1, 1024] - - [1013, 1571.0] + - [1087, 1571.0] - - [10, 2048, 1, 500] - - [948, 920.963] + - [1022, 920.963] - - [4, 2000, 1, 2048] - - [967, 326.215] + - [1041, 326.215] - - [512, 100, 1, 2048] - - [1102, 3084.15] + - [1176, 3084.15] - - [32, 100, 1, 2000] - - [948, 343.448] + - [1022, 343.448] - - [256, 512, 1, 500] - - [1041, 4311.68] + - [1115, 4311.68] - - [100, 2000, 1, 100] - - [1072, 2016.23] + - [1146, 2016.23] - - [8, 2000, 1, 1024] - - [961, 544.781] + - [1035, 544.781] - - [4, 512, 1, 500] - - [948, 118.619] + - [1022, 118.619] - - [128, 1024, 1, 10] - - [1111, 244.637] + - [1185, 244.637] - - [4, 500, 1, 1024] - - [948, 144.733] + - [1022, 144.733] - - [32, 2048, 1, 512] - - [951, 2140.05] + - [1025, 2140.05] - - [32, 100, 1, 10] - - [926, 7.11754] + - [1000, 7.11754] - - [100, 2048, 1, 10] - - [1115, 341.433] + - [1189, 341.433] - - [512, 500, 1, 100] - - [1076, 2461.64] + - [1150, 2461.64] - - [128, 2000, 1, 1024] - - [1060, 4174.37] + - [1134, 4174.37] - - [200, 1024, 1, 500] - - [1072, 4295.4] + - [1146, 4295.4] - - [32, 2048, 1, 1024] - - [975, 1667.82] + - [1049, 1667.82] - - [10, 1024, 1, 2048] - - [966, 555.49] + - [1040, 555.49] - - [8, 500, 1, 100] - - [947, 71.5286] + - [1021, 71.5286] - - [32, 2048, 1, 500] - - [951, 2528.5] + - [1025, 2528.5] - - [200, 100, 1, 1024] - - [960, 1071.23] + - [1034, 1071.23] - - [16, 100, 1, 100] - - [937, 28.6714] + - [1011, 28.6714] - - [8, 1024, 1, 2000] - - [1021, 654.413] + - [1095, 654.413] - - [4, 512, 1, 100] - - [947, 36.6714] + - [1021, 36.6714] - - [16, 500, 1, 100] - - [947, 142.957] + - [1021, 142.957] - - [8, 1024, 1, 2048] - - [973, 441.606] + - [1047, 441.606] - - [16, 1024, 1, 2048] - - [974, 886.845] + - [1048, 886.845] - - [10, 2048, 1, 1024] - - [952, 639.476] + - [1026, 639.476] - - [64, 512, 1, 100] - - [947, 518.581] + - [1021, 518.581] - - [2, 100, 1, 500] - - [948, 9.71538] + - [1022, 9.71538] - - [2, 500, 1, 512] - - [954, 48.2203] + - [1028, 48.2203] - - [256, 512, 1, 2000] - - [1057, 6450.49] + - [1131, 6450.49] - - [128, 500, 1, 1024] - - [951, 2497.66] + - [1025, 2497.66] - - [10, 100, 1, 10] - - [988, 2.33214] + - [1062, 2.33214] - - [8, 2048, 1, 2048] - - [938, 643.398] + - [1012, 643.398] - - [16, 2048, 1, 2048] - - [978, 1338.0] + - [1052, 1338.0] - - [64, 1024, 1, 10] - - [941, 132.229] + - [1015, 132.229] - - [500, 100, 1, 500] - - [1099, 1941.09] + - [1173, 1941.09] - - [256, 1024, 1, 2000] - - [1095, 7629.44] + - [1169, 7629.44] - - [200, 512, 1, 500] - - [1084, 3232.42] + - [1158, 3232.42] - - [8, 2000, 1, 10] - - [985, 32.3581] + - [1059, 32.3581] - - [64, 2000, 1, 512] - - [1083, 3225.3] + - [1157, 3225.3] - - [2, 512, 1, 100] - - [927, 16.7234] + - [1001, 16.7234] - - [4, 2000, 1, 2000] - - [948, 586.61] + - [1022, 586.61] - - [200, 1024, 1, 100] - - [1072, 2133.43] + - [1146, 2133.43] - - [16, 100, 1, 500] - - [1012, 92.6926] + - [1086, 92.6926] - - [128, 100, 1, 500] - - [1008, 526.416] + - [1082, 526.416] - - [500, 1024, 1, 1024] - - [1042, 7201.86] + - [1116, 7201.86] - - [200, 1024, 1, 1024] - - [1094, 4519.82] + - [1168, 4519.82] - - [8, 2048, 1, 512] - - [958, 624.252] + - [1032, 624.252] - - [200, 2000, 1, 500] - - [1048, 5186.82] + - [1122, 5186.82] - - [512, 100, 1, 1024] - - [1099, 2742.19] + - [1173, 2742.19] - - [16, 100, 1, 2000] - - [958, 168.876] + - [1032, 168.876] - - [500, 512, 1, 2000] - - [1095, 7289.39] + - [1169, 7289.39] - - [8, 2000, 1, 2048] - - [969, 668.289] + - [1043, 668.289] - - [256, 2048, 1, 100] - - [1030, 3924.41] + - [1104, 3924.41] - - [32, 2048, 1, 2000] - - [962, 3882.56] + - [1036, 3882.56] - - [200, 500, 1, 512] - - [1087, 3368.52] + - [1161, 3368.52] - - [10, 512, 1, 100] - - [947, 91.5286] + - [1021, 91.5286] - - [16, 2000, 1, 10] - - [925, 61.6385] + - [999, 61.6385] - - [8, 512, 1, 100] - - [947, 72.2127] + - [1021, 72.2127] - - [256, 512, 1, 512] - - [1052, 4584.04] + - [1126, 4584.04] - - [500, 2000, 1, 1024] - - [1027, 7569.59] + - [1101, 7569.59] - - [512, 512, 1, 500] - - [1043, 5708.81] + - [1117, 5708.81] - - [256, 2048, 1, 1024] - - [1067, 5923.21] + - [1141, 5923.21] - - [8, 2048, 1, 2000] - - [948, 1153.9] + - [1022, 1153.9] - - [100, 512, 1, 2048] - - [1014, 2383.23] + - [1088, 2383.23] - - [100, 1024, 1, 512] - - [1099, 3343.77] + - [1173, 3343.77] - - [128, 100, 1, 2000] - - [1117, 1084.85] + - [1191, 1084.85] - - [4, 2048, 1, 2048] - - [966, 332.454] + - [1040, 332.454] - - [2, 1024, 1, 2000] - - [977, 161.106] + - [1051, 161.106] - - [100, 512, 1, 512] - - [951, 2184.63] + - [1025, 2184.63] - - [128, 1024, 1, 1024] - - [1094, 3848.09] + - [1168, 3848.09] - - [200, 2048, 1, 1024] - - [1029, 4547.26] + - [1103, 4547.26] - - [32, 1024, 1, 2000] - - [958, 2416.62] + - [1032, 2416.62] - - [128, 500, 1, 100] - - [953, 919.64] + - [1027, 919.64] - - [200, 512, 1, 2000] - - [1121, 4238.51] + - [1195, 4238.51] - - [10, 2048, 1, 2000] - - [958, 1454.65] + - [1032, 1454.65] - - [256, 1024, 1, 500] - - [1055, 5669.3] + - [1129, 5669.3] - - [100, 100, 1, 100] - - [947, 171.333] + - [1021, 171.333] - - [8, 512, 1, 1024] - - [1016, 286.596] + - [1090, 286.596] - - [200, 1024, 1, 512] - - [1072, 4354.65] + - [1146, 4354.65] - - [256, 500, 1, 500] - - [1057, 4020.2] + - [1131, 4020.2] - - [200, 100, 1, 500] - - [1121, 702.347] + - [1195, 702.347] - - [2, 1024, 1, 2048] - - [967, 112.85] + - [1041, 112.85] - - [256, 500, 1, 2048] - - [1057, 5041.33] + - [1131, 5041.33] - - [512, 2048, 1, 500] - - [1050, 7710.22] + - [1124, 7710.22] - - [512, 100, 1, 2000] - - [1099, 3099.37] + - [1173, 3099.37] - - [512, 500, 1, 1024] - - [1058, 6463.22] + - [1132, 6463.22] - - [16, 512, 1, 2000] - - [974, 721.227] + - [1048, 721.227] - - [64, 500, 1, 1024] - - [1013, 1528.46] + - [1087, 1528.46] - - [512, 2000, 1, 10] - - [1034, 1174.41] + - [1108, 1174.41] - - [256, 512, 1, 1024] - - [1052, 4978.5] + - [1126, 4978.5] - - [10, 512, 1, 1024] - - [1012, 370.36] + - [1086, 370.36] - - [512, 100, 1, 100] - - [1035, 659.894] + - [1109, 659.894] - - [8, 2000, 1, 100] - - [947, 256.51] + - [1021, 256.51] - - [128, 2048, 1, 1024] - - [1060, 4173.54] + - [1134, 4173.54] - - [2, 2000, 1, 2000] - - [948, 250.727] + - [1022, 250.727] - - [16, 2048, 1, 1024] - - [995, 1046.06] + - [1069, 1046.06] - - [500, 512, 1, 500] - - [1040, 5517.34] + - [1114, 5517.34] - - [8, 100, 1, 1024] - - [1013, 64.1] + - [1087, 64.1] - - [10, 100, 1, 100] - - [937, 17.9571] + - [1011, 17.9571] - - [200, 500, 1, 500] - - [1087, 3140.8] + - [1161, 3140.8] - - [10, 500, 1, 2000] - - [974, 444.94] + - [1048, 444.94] - - [500, 100, 1, 2000] - - [1102, 2969.22] + - [1176, 2969.22] - - [100, 512, 1, 2000] - - [1014, 2776.67] + - [1088, 2776.67] - - [500, 1024, 1, 2000] - - [1093, 8020.15] + - [1167, 8020.15] - - [32, 2000, 1, 2000] - - [954, 3827.85] + - [1028, 3827.85] - - [64, 1024, 1, 512] - - [1118, 2573.29] + - [1192, 2573.29] - - [64, 2000, 1, 2000] - - [1087, 5797.2] + - [1161, 5797.2] - - [32, 500, 1, 100] - - [947, 266.767] + - [1021, 266.767] - - [128, 2000, 1, 2048] - - [1103, 4548.05] + - [1177, 4548.05] - - [10, 100, 1, 2048] - - [1012, 98.5615] + - [1086, 98.5615] - - [32, 2048, 1, 2048] - - [975, 2213.45] + - [1049, 2213.45] - - [64, 100, 1, 100] - - [948, 96.4855] + - [1022, 96.4855] - - [2, 1024, 1, 100] - - [998, 34.6946] + - [1072, 34.6946] - - [256, 1024, 1, 10] - - [1068, 425.658] + - [1142, 425.658] - - [256, 1024, 1, 1024] - - [1061, 5482.85] + - [1135, 5482.85] - - [64, 500, 1, 2000] - - [948, 2056.66] + - [1022, 2056.66] - - [512, 2000, 1, 512] - - [1046, 7550.33] + - [1120, 7550.33] - - [8, 512, 1, 512] - - [955, 232.086] + - [1029, 232.086] - - [8, 512, 1, 2048] - - [948, 290.564] + - [1022, 290.564] - - [100, 100, 1, 1024] - - [1118, 624.49] + - [1192, 624.49] - - [2, 2048, 1, 10] - - [991, 8.92759] + - [1065, 8.92759] - - [4, 2048, 1, 512] - - [997, 312.176] + - [1071, 312.176] - - [4, 2048, 1, 10] - - [990, 18.0649] + - [1064, 18.0649] - - [8, 100, 1, 2000] - - [967, 85.9369] + - [1041, 85.9369] - - [2, 1024, 1, 1024] - - [964, 101.314] + - [1038, 101.314] - - [16, 2048, 1, 100] - - [948, 518.581] + - [1022, 518.581] - - [16, 512, 1, 512] - - [958, 456.003] + - [1032, 456.003] - - [32, 500, 1, 512] - - [955, 906.295] + - [1029, 906.295] - - [500, 2000, 1, 2000] - - [1050, 8143.42] + - [1124, 8143.42] - - [500, 1024, 1, 10] - - [1031, 680.951] + - [1105, 680.951] - - [32, 500, 1, 1024] - - [1007, 1008.97] + - [1081, 1008.97] - - [32, 500, 1, 10] - - [943, 33.4333] + - [1017, 33.4333] - - [500, 500, 1, 10] - - [1072, 367.747] + - [1146, 367.747] - - [4, 2000, 1, 500] - - [958, 370.47] + - [1032, 370.47] - - [10, 2000, 1, 500] - - [948, 899.381] + - [1022, 899.381] - - [32, 2000, 1, 512] - - [960, 2089.9] + - [1034, 2089.9] - - [256, 500, 1, 100] - - [1073, 1495.43] + - [1147, 1495.43] - - [256, 2048, 1, 10] - - [1031, 789.69] + - [1105, 789.69] - - [4, 1024, 1, 500] - - [948, 222.709] + - [1022, 222.709] - - [256, 512, 1, 2048] - - [1057, 5292.6] + - [1131, 5292.6] - - [2, 2000, 1, 1024] - - [995, 137.365] + - [1069, 137.365] - - [256, 100, 1, 512] - - [1099, 1085.13] + - [1173, 1085.13] - - [8, 1024, 1, 500] - - [948, 441.479] + - [1022, 441.479] - - [256, 2048, 1, 500] - - [1078, 7031.86] + - [1152, 7031.86] - - [256, 2048, 1, 2048] - - [1041, 6771.93] + - [1115, 6771.93] - - [2, 2000, 1, 512] - - [1002, 159.106] + - [1076, 159.106] - - [256, 2000, 1, 512] - - [1045, 6527.59] + - [1119, 6527.59] - - [4, 1024, 1, 100] - - [994, 70.237] + - [1068, 70.237] - - [512, 1024, 1, 2000] - - [1079, 8295.8] + - [1153, 8295.8] - - [100, 500, 1, 500] - - [951, 2016.23] + - [1025, 2016.23] - - [4, 2048, 1, 1024] - - [999, 285.039] + - [1073, 285.039] - - [2, 1024, 1, 500] - - [948, 109.502] + - [1022, 109.502] - - [64, 100, 1, 500] - - [948, 296.396] + - [1022, 296.396] - - [256, 2000, 1, 2000] - - [1056, 8152.97] + - [1130, 8152.97] - - [2, 512, 1, 500] - - [954, 44.8552] + - [1028, 44.8552] - - [8, 2048, 1, 500] - - [948, 736.791] + - [1022, 736.791] - - [10, 1024, 1, 500] - - [948, 547.109] + - [1022, 547.109] - - [4, 2048, 1, 2000] - - [958, 604.23] + - [1032, 604.23] - - [200, 1024, 1, 2000] - - [1125, 5400.94] + - [1199, 5400.94] - - [128, 500, 1, 512] - - [1118, 2730.77] + - [1192, 2730.77] - - [10, 500, 1, 2048] - - [1012, 359.651] + - [1086, 359.651] - - [256, 2048, 1, 2000] - - [1056, 8375.31] + - [1130, 8375.31] - - [8, 2000, 1, 2000] - - [958, 1146.23] + - [1032, 1146.23] - - [100, 2048, 1, 512] - - [1081, 3936.2] + - [1155, 3936.2] - - [512, 500, 1, 2048] - - [1057, 6756.39] + - [1131, 6756.39] - - [200, 2048, 1, 100] - - [1048, 3180.22] + - [1122, 3180.22] - - [128, 512, 1, 512] - - [951, 2872.91] + - [1025, 2872.91] - - [200, 2000, 1, 2048] - - [1097, 4818.92] + - [1171, 4818.92] - - [4, 2000, 1, 1024] - - [995, 275.369] + - [1069, 275.369] - - [64, 512, 1, 10] - - [1110, 69.5237] + - [1184, 69.5237] - - [32, 500, 1, 2000] - - [977, 1246.21] + - [1051, 1246.21] - - [128, 2048, 1, 2000] - - [1090, 7233.65] + - [1164, 7233.65] - - [100, 100, 1, 2048] - - [948, 790.223] + - [1022, 790.223] - - [500, 2048, 1, 512] - - [1078, 7249.66] + - [1152, 7249.66] - - [200, 100, 1, 512] - - [954, 748.638] + - [1028, 748.638] - - [32, 2000, 1, 100] - - [949, 930.333] + - [1023, 930.333] - - [500, 512, 1, 2048] - - [1100, 6640.02] + - [1174, 6640.02] - - [500, 2000, 1, 500] - - [1080, 7078.24] + - [1154, 7078.24] - - [200, 100, 1, 2048] - - [958, 1387.63] + - [1032, 1387.63] - - [2, 2048, 1, 100] - - [992, 64.9101] + - [1066, 64.9101] - - [8, 100, 1, 10] - - [933, 1.85439] + - [1007, 1.85439] - - [200, 2048, 1, 2048] - - [1097, 5022.02] + - [1171, 5022.02] - - [200, 2048, 1, 500] - - [1048, 5355.75] + - [1122, 5355.75] - - [100, 100, 1, 500] - - [1118, 416.767] + - [1192, 416.767] - - [8, 2048, 1, 10] - - [989, 34.8119] + - [1063, 34.8119] - - [100, 500, 1, 10] - - [929, 93.3836] + - [1003, 93.3836] - - [200, 500, 1, 2000] - - [1121, 4152.92] + - [1195, 4152.92] - - [512, 2000, 1, 500] - - [1050, 7485.48] + - [1124, 7485.48] - - [10, 500, 1, 1024] - - [1016, 363.736] + - [1090, 363.736] - - [256, 100, 1, 10] - - [1065, 41.1256] + - [1139, 41.1256] - - [500, 512, 1, 1024] - - [1044, 6362.82] + - [1118, 6362.82] - - [200, 2048, 1, 2000] - - [1086, 6321.09] + - [1160, 6321.09] - - [100, 1024, 1, 100] - - [1085, 1306.22] + - [1159, 1306.22] - - [500, 1024, 1, 100] - - [1024, 3699.52] + - [1098, 3699.52] - - [10, 512, 1, 2048] - - [948, 361.18] + - [1022, 361.18] - - [2, 1024, 1, 512] - - [997, 105.803] + - [1071, 105.803] - - [4, 500, 1, 2048] - - [1020, 143.517] + - [1094, 143.517] - - [100, 512, 1, 100] - - [953, 744.286] + - [1027, 744.286] - - [16, 500, 1, 512] - - [948, 453.197] + - [1022, 453.197] - - [10, 1024, 1, 100] - - [946, 166.334] + - [1020, 166.334] - - [8, 1024, 1, 100] - - [994, 140.374] + - [1068, 140.374] - - [64, 2000, 1, 500] - - [1089, 3940.99] + - [1163, 3940.99] - - [64, 1024, 1, 2000] - - [954, 3531.13] + - [1028, 3531.13] - - [10, 100, 1, 512] - - [948, 61.6385] + - [1022, 61.6385] - - [4, 500, 1, 2000] - - [974, 173.11] + - [1048, 173.11] - - [512, 1024, 1, 10] - - [1025, 736.46] + - [1099, 736.46] - - [128, 2048, 1, 2048] - - [1088, 4596.6] + - [1162, 4596.6] - - [4, 100, 1, 100] - - [937, 7.24286] + - [1011, 7.24286] - - [32, 1024, 1, 512] - - [997, 1519.78] + - [1071, 1519.78] - - [8, 512, 1, 2000] - - [1022, 356.894] + - [1096, 356.894] - - [100, 100, 1, 512] - - [962, 426.767] + - [1036, 426.767] - - [2, 2048, 1, 2048] - - [971, 170.878] + - [1045, 170.878] - - [2, 512, 1, 2000] - - [974, 90.8801] + - [1048, 90.8801] - - [16, 500, 1, 10] - - [947, 18.2818] + - [1021, 18.2818] - - [10, 500, 1, 100] - - [947, 88.1282] + - [1021, 88.1282] - - [4, 100, 1, 500] - - [1012, 23.6849] + - [1086, 23.6849] - - [512, 1024, 1, 1024] - - [1064, 7431.87] + - [1138, 7431.87] - - [64, 500, 1, 100] - - [957, 506.429] + - [1031, 506.429] - - [128, 2000, 1, 10] - - [1115, 432.532] + - [1189, 432.532] - - [10, 2000, 1, 2048] - - [978, 806.399] + - [1052, 806.399] - - [2, 100, 1, 100] - - [935, 3.225] + - [1009, 3.225] - - [10, 512, 1, 2000] - - [967, 462.194] + - [1041, 462.194] - - [8, 500, 1, 500] - - [948, 231.581] + - [1022, 231.581] - - [4, 500, 1, 512] - - [948, 118.619] + - [1022, 118.619] - - [10, 500, 1, 10] - - [942, 11.0649] + - [1016, 11.0649] - - [64, 512, 1, 2000] - - [948, 2116.9] + - [1022, 2116.9] - - [500, 512, 1, 10] - - [1069, 395.162] + - [1143, 395.162] - - [200, 512, 1, 512] - - [1087, 3449.36] + - [1161, 3449.36] - - [512, 500, 1, 500] - - [1043, 5536.43] + - [1117, 5536.43] - - [32, 512, 1, 2000] - - [958, 1264.3] + - [1032, 1264.3] - - [128, 500, 1, 2048] - - [1014, 3006.34] + - [1088, 3006.34] - - [500, 2048, 1, 10] - - [1039, 1049.28] + - [1113, 1049.28] - - [512, 512, 1, 100] - - [1076, 2664.16] + - [1150, 2664.16] - - [200, 2000, 1, 512] - - [1084, 5192.8] + - [1158, 5192.8] - - [500, 500, 1, 512] - - [1040, 5673.86] + - [1114, 5673.86] - - [128, 2048, 1, 500] - - [1072, 5251.38] + - [1146, 5251.38] - - [4, 512, 1, 512] - - [948, 123.753] + - [1022, 123.753] - - [16, 2048, 1, 2000] - - [964, 2294.78] + - [1038, 2294.78] - - [16, 500, 1, 1024] - - [948, 562.737] + - [1022, 562.737] - - [256, 2000, 1, 500] - - [1078, 6639.1] + - [1152, 6639.1] - - [10, 1024, 1, 10] - - [928, 21.0836] + - [1002, 21.0836] - - [16, 500, 1, 500] - - [948, 446.529] + - [1022, 446.529] - - [10, 2048, 1, 512] - - [946, 784.962] + - [1020, 784.962] - - [200, 500, 1, 10] - - [921, 176.156] + - [995, 176.156] - - [256, 2048, 1, 512] - - [1075, 6540.93] + - [1149, 6540.93] - - [256, 2000, 1, 2048] - - [1052, 6670.43] + - [1126, 6670.43] - - [500, 2048, 1, 500] - - [1080, 7264.57] + - [1154, 7264.57] - - [500, 100, 1, 1024] - - [1102, 2700.52] + - [1176, 2700.52] - - [16, 100, 1, 512] - - [1012, 96.7038] + - [1086, 96.7038] - - [64, 512, 1, 2048] - - [1013, 1868.39] + - [1087, 1868.39] - - [32, 1024, 1, 10] - - [924, 69.5237] + - [998, 69.5237] - - [16, 2048, 1, 512] - - [997, 1226.5] + - [1071, 1226.5] - - [8, 1024, 1, 512] - - [997, 416.202] + - [1071, 416.202] - - [4, 1024, 1, 2048] - - [1019, 223.201] + - [1093, 223.201] - - [100, 2048, 1, 2000] - - [1092, 5614.14] + - [1166, 5614.14] - - [512, 512, 1, 2048] - - [1057, 6868.97] + - [1131, 6868.97] - - [256, 2000, 1, 1024] - - [1048, 5758.98] + - [1122, 5758.98] - - [64, 512, 1, 512] - - [1117, 1651.4] + - [1191, 1651.4] - - [200, 1024, 1, 10] - - [931, 341.433] + - [1005, 341.433] - - [128, 500, 1, 500] - - [960, 2580.75] + - [1034, 2580.75] - - [100, 512, 1, 1024] - - [951, 2041.72] + - [1025, 2041.72] - - [16, 1024, 1, 500] - - [948, 867.897] + - [1022, 867.897] - - [128, 100, 1, 2048] - - [1118, 1011.46] + - [1192, 1011.46] - - [100, 512, 1, 500] - - [951, 2051.38] + - [1025, 2051.38] - - [8, 1024, 1, 1024] - - [964, 424.625] + - [1038, 424.625] - - [2, 2000, 1, 10] - - [990, 8.57458] + - [1064, 8.57458] - - [4, 500, 1, 10] - - [987, 4.56429] + - [1061, 4.56429] - - [500, 2000, 1, 2048] - - [1064, 7444.12] + - [1138, 7444.12] - - [4, 2000, 1, 100] - - [1000, 128.305] + - [1074, 128.305] - - [512, 2000, 1, 2000] - - [1050, 8454.53] + - [1124, 8454.53] - - [128, 500, 1, 10] - - [1109, 117.747] + - [1183, 117.747] - - [32, 1024, 1, 100] - - [957, 512.1] + - [1031, 512.1] - - [8, 500, 1, 2048] - - [972, 286.935] + - [1046, 286.935] - - [16, 1024, 1, 1024] - - [936, 881.256] + - [1010, 881.256] - - [200, 100, 1, 10] - - [1108, 40.4226] + - [1182, 40.4226] - - [512, 100, 1, 500] - - [1102, 1987.68] + - [1176, 1987.68] - - [512, 2048, 1, 2048] - - [1059, 8063.65] + - [1133, 8063.65] - - [16, 2000, 1, 512] - - [958, 1204.81] + - [1032, 1204.81] - - [64, 2048, 1, 1024] - - [956, 2853.37] + - [1030, 2853.37] - - [32, 2048, 1, 10] - - [930, 130.132] + - [1004, 130.132] - - [10, 2048, 1, 10] - - [932, 39.4846] + - [1006, 39.4846] - - [4, 2000, 1, 512] - - [948, 316.149] + - [1022, 316.149] - - [4, 500, 1, 100] - - [947, 35.8143] + - [1021, 35.8143] - - [8, 100, 1, 2048] - - [967, 84.7281] + - [1041, 84.7281] - - [512, 2048, 1, 10] - - [1047, 1225.07] + - [1121, 1225.07] - - [512, 100, 1, 10] - - [1036, 90.2408] + - [1110, 90.2408] - - [4, 512, 1, 1024] - - [948, 143.348] + - [1022, 143.348] - - [16, 2048, 1, 10] - - [981, 65.1159] + - [1055, 65.1159] - - [500, 2000, 1, 100] - - [1032, 4717.08] + - [1106, 4717.08] - - [32, 1024, 1, 2048] - - [975, 1582.86] + - [1049, 1582.86] - - [100, 2000, 1, 2000] - - [1092, 5512.78] + - [1166, 5512.78] - - [128, 100, 1, 512] - - [1118, 561.196] + - [1192, 561.196] - - [500, 500, 1, 100] - - [1072, 2460.73] + - [1146, 2460.73] - - [32, 2000, 1, 10] - - [924, 119.503] + - [998, 119.503] - - [128, 2048, 1, 100] - - [1072, 2708.2] + - [1146, 2708.2] - - [10, 2000, 1, 100] - - [947, 316.556] + - [1021, 316.556] - - [2, 2048, 1, 500] - - [958, 191.145] + - [1032, 191.145] - - [32, 1024, 1, 500] - - [958, 1563.46] + - [1032, 1563.46] - - [4, 1024, 1, 10] - - [987, 9.24286] + - [1061, 9.24286] - - [100, 512, 1, 10] - - [1113, 97.0697] + - [1187, 97.0697] - - [8, 100, 1, 100] - - [963, 14.3857] + - [1037, 14.3857] - - [128, 512, 1, 500] - - [951, 2677.22] + - [1025, 2677.22] - - [16, 100, 1, 2048] - - [974, 161.997] + - [1048, 161.997] - - [2, 1024, 1, 10] - - [987, 4.59123] + - [1061, 4.59123] - - [4, 100, 1, 2048] - - [967, 41.8959] + - [1041, 41.8959] - - [4, 512, 1, 2000] - - [967, 180.382] + - [1041, 180.382] - - [4096, 64, 1, 2048] - - [1167, 7247.28] + - [1241, 7247.28] - - [1024, 10080, 1, 1024] - - [1155, 9833.47] + - [1229, 9833.47] - - [1024, 1131, 1, 1024] - - [1133, 7551.95] + - [1207, 7551.95] - - [36548, 1216, 1, 1024] - - [1145, 10351.6] + - [1219, 10351.6] - - [1024, 29, 1, 1024] - - [1177, 1697.01] + - [1251, 1697.01] - - [1024, 2592, 1, 1024] - - [1146, 8424.11] + - [1220, 8424.11] - - [1024, 1568, 1, 1024] - - [1157, 7511.86] + - [1231, 7511.86] - - [4096, 91, 1, 2048] - - [1126, 5599.91] + - [1200, 5599.91] - - [1024, 4445, 1, 1024] - - [1144, 9261.22] + - [1218, 9261.22] - - [1024, 6272, 1, 1024] - - [1139, 9439.61] + - [1213, 9439.61] - - [36548, 3584, 1, 1024] - - [1138, 10393.8] + - [1212, 10393.8] - - [1024, 1827, 1, 1024] - - [1157, 8714.42] + - [1231, 8714.42] - - [1024, 3220, 1, 1024] - - [1137, 8861.2] + - [1211, 8861.2] - - [1024, 1856, 1, 1024] - - [1154, 8827.05] + - [1228, 8827.05] - - [1024, 1760, 1, 1024] - - [1154, 8334.2] + - [1228, 8334.2] - - [1024, 1600, 1, 1024] - - [1154, 7615.07] + - [1228, 7615.07] - - [1024, 1, 1, 21] - - [1158, 0.1] + - [1232, 0.1] - - [36548, 4235, 1, 1024] - - [1138, 10276.8] + - [1212, 10276.8] - - [1024, 49, 1, 1024] - - [1173, 2643.12] + - [1247, 2643.12] - - [1024, 1984, 1, 1024] - - [1157, 9449.52] + - [1231, 9449.52] - - [1024, 14720, 1, 1024] - - [1144, 10033.3] + - [1218, 10033.3] - - [1024, 1152, 1, 1024] - - [1127, 7523.54] + - [1201, 7523.54] - - [36548, 14976, 1, 1024] - - [1145, 10421.7] + - [1219, 10421.7] - - [36548, 1152, 1, 1024] - - [1145, 10258.1] + - [1219, 10258.1] - - [4096, 86, 1, 3072] - - [1126, 5308.85] + - [1200, 5308.85] - - [1024, 3392, 1, 1024] - - [1139, 9176.54] + - [1213, 9176.54] - - [1024, 1408, 1, 1024] - - [1139, 8958.83] + - [1213, 8958.83] - - [1024, 2080, 1, 1024] - - [1130, 8396.49] + - [1204, 8396.49] - - [1024, 1824, 1, 1024] - - [1148, 8671.71] + - [1222, 8671.71] - - [36548, 2432, 1, 1024] - - [1138, 10392.6] + - [1212, 10392.6] - - [4096, 29, 1, 2048] - - [1159, 4325.66] + - [1233, 4325.66] - - [1024, 1102, 1, 1024] - - [1133, 7204.18] + - [1207, 7204.18] - - [4096, 49, 1, 2048] - - [1165, 5609.29] + - [1239, 5609.29] - - [36548, 1827, 1, 1024] - - [1145, 10183.2] + - [1219, 10183.2] - - [4096, 25, 1, 2048] - - [1160, 3788.31] + - [1234, 3788.31] - - [1024, 10176, 1, 1024] - - [1155, 9941.18] + - [1229, 9941.18] - - [1024, 774, 1, 1024] - - [1140, 7079.67] + - [1214, 7079.67] - - [1024, 1952, 1, 1024] - - [1157, 9300.49] + - [1231, 9300.49] - - [4096, 128, 1, 2048] - - [1127, 8274.96] + - [1201, 8274.96] - - [1024, 17024, 1, 1024] - - [1137, 9960.72] + - [1211, 9960.72] - - [1024, 1472, 1, 1024] - - [1146, 9343.37] + - [1220, 9343.37] - - [36548, 4459, 1, 1024] - - [1138, 10358.1] + - [1212, 10358.1] - - [4096, 91, 1, 3072] - - [1132, 5509.39] + - [1206, 5509.39] - - [1024, 3712, 1, 1024] - - [1146, 9048.66] + - [1220, 9048.66] - - [4096, 64, 1, 3072] - - [1179, 7489.93] + - [1253, 7489.93] - - [4096, 29, 1, 3072] - - [1159, 4511.78] + - [1233, 4511.78] - - [4096, 128, 1, 3072] - - [1126, 8423.83] + - [1200, 8423.83] - - [36548, 12928, 1, 1024] - - [1145, 10426.1] + - [1219, 10426.1] - - [1024, 1632, 1, 1024] - - [1127, 7761.73] + - [1201, 7761.73] - - [1024, 1696, 1, 1024] - - [1152, 8107.29] + - [1226, 8107.29] - - [4096, 24, 1, 2048] - - [1159, 3663.25] + - [1233, 3663.25] - - [4096, 63, 1, 3072] - - [1168, 7175.37] + - [1242, 7175.37] - - [4096, 96, 1, 2048] - - [1127, 5866.28] + - [1201, 5866.28] - - [36548, 1764, 1, 1024] - - [1138, 10128.5] + - [1212, 10128.5] - - [4096, 32, 1, 2048] - - [1163, 4540.62] + - [1237, 4540.62] - - [1024, 35, 1, 1024] - - [1171, 1911.57] + - [1245, 1911.57] - - [1024, 1120, 1, 1024] - - [1126, 7289.13] + - [1200, 7289.13] - - [4096, 49, 1, 3072] - - [1165, 5751.62] + - [1239, 5751.62] - - [1024, 24, 1, 1024] - - [1171, 1392.02] + - [1245, 1392.02] - - [1024, 2944, 1, 1024] - - [1147, 9284.93] + - [1221, 9284.93] - - [36548, 14080, 1, 1024] - - [1138, 10441.4] + - [1212, 10441.4] - - [1024, 1, 1, 1024] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 1280, 1, 1024] - - [1126, 8244.46] + - [1200, 8244.46] - - [1024, 13440, 1, 1024] - - [1138, 9799.92] + - [1212, 9799.92] - - [1024, 1015, 1, 1024] - - [1146, 9187.85] + - [1220, 9187.85] - - [36548, 9120, 1, 1024] - - [1138, 10400.0] + - [1212, 10400.0] - - [36548, 1, 1, 1024] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 3008, 1, 1024] - - [1147, 9468.55] + - [1221, 9468.55] - - [1024, 2560, 1, 1024] - - [1144, 8879.31] + - [1218, 8879.31] - - [1024, 21, 1, 1024] - - [1170, 1234.41] + - [1244, 1234.41] - - [1024, 2208, 1, 1024] - - [1126, 8231.27] + - [1200, 8231.27] - - [1024, 96, 1, 1024] - - [1176, 3767.44] + - [1250, 3767.44] - - [4096, 86, 1, 2048] - - [1127, 5529.09] + - [1201, 5529.09] - - [4096, 96, 1, 3072] - - [1126, 6273.28] + - [1200, 6273.28] - - [1024, 1920, 1, 1024] - - [1156, 9118.19] + - [1230, 9118.19] - - [4096, 27, 1, 2048] - - [1159, 4073.7] + - [1233, 4073.7] - - [36548, 2496, 1, 1024] - - [1138, 10361.2] + - [1212, 10361.2] - - [1024, 1, 1, 14] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 91, 1, 1024] - - [1178, 3647.67] + - [1252, 3647.67] - - [1024, 2016, 1, 1024] - - [1154, 9560.24] + - [1228, 9560.24] - - [1024, 1184, 1, 1024] - - [1127, 7678.96] + - [1201, 7678.96] - - [4096, 1, 1, 2048] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 1664, 1, 1024] - - [1152, 7934.07] + - [1226, 7934.07] - - [1024, 11424, 1, 1024] - - [1144, 9777.91] + - [1218, 9777.91] - - [4096, 24, 1, 3072] - - [1162, 3813.1] + - [1236, 3813.1] - - [1024, 1216, 1, 1024] - - [1126, 7902.13] + - [1200, 7902.13] - - [36548, 3185, 1, 1024] - - [1138, 10336.7] + - [1212, 10336.7] - - [36548, 9216, 1, 1024] - - [1138, 10414.3] + - [1212, 10414.3] - - [1024, 3200, 1, 1024] - - [1144, 8847.01] + - [1218, 8847.01] - - [1024, 2656, 1, 1024] - - [1139, 8649.25] + - [1213, 8649.25] - - [1024, 2368, 1, 1024] - - [1139, 8873.16] + - [1213, 8873.16] - - [1024, 4459, 1, 1024] - - [1146, 9431.32] + - [1220, 9431.32] - - [1024, 3808, 1, 1024] - - [1146, 9263.72] + - [1220, 9263.72] - - [1024, 2336, 1, 1024] - - [1139, 8966.0] + - [1213, 8966.0] - - [4096, 27, 1, 3072] - - [1159, 4171.74] + - [1233, 4171.74] - - [1024, 2304, 1, 1024] - - [1136, 8601.38] + - [1210, 8601.38] - - [1024, 1560, 1, 1024] - - [1151, 7481.74] + - [1225, 7481.74] - - [4096, 35, 1, 3072] - - [1165, 4176.9] + - [1239, 4176.9] - - [1024, 2496, 1, 1024] - - [1142, 9092.86] + - [1216, 9092.86] - - [1024, 1504, 1, 1024] - - [1142, 9220.53] + - [1216, 9220.53] - - [4096, 50, 1, 2048] - - [1166, 5472.83] + - [1240, 5472.83] - - [1024, 3232, 1, 1024] - - [1139, 8961.94] + - [1213, 8961.94] - - [1024, 14, 1, 1024] - - [1170, 882.315] + - [1244, 882.315] - - [36548, 1015, 1, 1024] - - [1138, 10140.9] + - [1212, 10140.9] - - [1024, 2000, 1, 1024] - - [1150, 9487.8] + - [1224, 9487.8] - - [36548, 243, 1, 1024] - - [1143, 9441.12] + - [1217, 9441.12] - - [36548, 32, 1, 1024] - - [1131, 4721.05] + - [1205, 4721.05] - - [1024, 25, 1, 1024] - - [1177, 1462.96] + - [1251, 1462.96] - - [1024, 13184, 1, 1024] - - [1141, 9866.28] + - [1215, 9866.28] - - [1024, 2688, 1, 1024] - - [1136, 8559.93] + - [1210, 8559.93] - - [1024, 27, 1, 1024] - - [1175, 1559.11] + - [1249, 1559.11] - - [36548, 950, 1, 1024] - - [1145, 10053.6] + - [1219, 10053.6] - - [1024, 1764, 1, 1024] - - [1152, 8347.11] + - [1226, 8347.11] - - [1024, 992, 1, 1024] - - [1139, 9035.82] + - [1213, 9035.82] - - [1024, 1376, 1, 1024] - - [1139, 8797.96] + - [1213, 8797.96] - - [1024, 950, 1, 1024] - - [1146, 8635.26] + - [1220, 8635.26] - - [36548, 774, 1, 1024] - - [1138, 9460.82] + - [1212, 9460.82] - - [36548, 25, 1, 1024] - - [1131, 3694.16] + - [1205, 3694.16] - - [1024, 4256, 1, 1024] - - [1139, 9172.16] + - [1213, 9172.16] - - [4096, 32, 1, 3072] - - [1160, 4886.67] + - [1234, 4886.67] - - [1024, 243, 1, 1024] - - [1164, 6594.41] + - [1238, 6594.41] - - [36548, 3712, 1, 1024] - - [1138, 10401.6] + - [1212, 10401.6] - - [1024, 50, 1, 1024] - - [1173, 2742.19] + - [1247, 2742.19] - - [1024, 3360, 1, 1024] - - [1135, 9017.37] + - [1209, 9017.37] - - [1024, 2048, 1, 1024] - - [1150, 9736.65] + - [1224, 9736.65] - - [1024, 2784, 1, 1024] - - [1146, 8835.6] + - [1220, 8835.6] - - [1024, 4992, 1, 1024] - - [1144, 9639.38] + - [1218, 9639.38] - - [36548, 1102, 1, 1024] - - [1145, 9859.04] + - [1219, 9859.04] - - [1024, 1536, 1, 1024] - - [1137, 9294.98] + - [1211, 9294.98] - - [1024, 2720, 1, 1024] - - [1142, 8617.88] + - [1216, 8617.88] - - [4096, 1, 1, 3072] - - [1158, 0.1] + - [1232, 0.1] - - [1024, 2752, 1, 1024] - - [1146, 8902.17] + - [1220, 8902.17] - - [1024, 2816, 1, 1024] - - [1144, 8906.95] + - [1218, 8906.95] - - [1024, 2624, 1, 1024] - - [1146, 8494.41] + - [1220, 8494.41] - - [1024, 2144, 1, 1024] - - [1129, 8243.56] + - [1203, 8243.56] - - [36548, 1131, 1, 1024] - - [1145, 10104.6] + - [1219, 10104.6] - - [4096, 25, 1, 3072] - - [1160, 3959.98] + - [1234, 3959.98] - - [1024, 64, 1, 1024] - - [1173, 3410.1] + - [1247, 3410.1] - - [1024, 3296, 1, 1024] - - [1144, 9066.52] + - [1218, 9066.52] - - [36548, 4992, 1, 1024] - - [1138, 10395.6] + - [1212, 10395.6] - - [1024, 1344, 1, 1024] - - [1139, 8522.66] + - [1213, 8522.66] - - [36548, 2401, 1, 1024] - - [1138, 10250.3] + - [1212, 10250.3] - - [1024, 15744, 1, 1024] - - [1138, 10006.4] + - [1212, 10006.4] - - [1024, 15232, 1, 1024] - - [1137, 9912.21] + - [1211, 9912.21] - - [1024, 1888, 1, 1024] - - [1149, 8962.98] + - [1223, 8962.98] - - [1024, 1792, 1, 1024] - - [1153, 8556.82] + - [1227, 8556.82] - - [36548, 1073, 1, 1024] - - [1138, 10161.2] + - [1212, 10161.2] - - [4096, 50, 1, 3072] - - [1165, 5882.16] + - [1239, 5882.16] - - [36548, 15488, 1, 1024] - - [1145, 10437.1] + - [1219, 10437.1] - - [1024, 2464, 1, 1024] - - [1142, 8880.02] + - [1216, 8880.02] - - [1024, 2272, 1, 1024] - - [1139, 8720.35] + - [1213, 8720.35] - - [1024, 13, 1, 1024] - - [1169, 774.616] + - [1243, 774.616] - - [1024, 2432, 1, 1024] - - [1144, 8491.53] + - [1218, 8491.53] - - [36548, 24, 1, 1024] - - [1131, 3564.41] + - [1205, 3564.41] - - [1024, 3936, 1, 1024] - - [1154, 9433.3] + - [1228, 9433.3] - - [36548, 13824, 1, 1024] - - [1138, 10439.8] + - [1212, 10439.8] - - [1024, 2401, 1, 1024] - - [1146, 8870.03] + - [1220, 8870.03] - - [1024, 32, 1, 1024] - - [1161, 1839.71] + - [1235, 1839.71] - - [1024, 2176, 1, 1024] - - [1130, 8544.55] + - [1204, 8544.55] - - [1024, 2240, 1, 1024] - - [1139, 8381.55] + - [1213, 8381.55] - - [1024, 1728, 1, 1024] - - [1127, 8212.33] + - [1201, 8212.33] - - [1024, 128, 1, 1024] - - [1174, 4660.44] + - [1248, 4660.44] - - [1024, 216, 1, 1024] - - [1164, 5777.97] + - [1238, 5777.97] - - [1024, 63, 1, 1024] - - [1172, 3329.75] + - [1246, 3329.75] - - [1024, 86, 1, 1024] - - [1178, 3533.7] + - [1252, 3533.7] - - [1024, 2528, 1, 1024] - - [1134, 8789.25] + - [1208, 8789.25] - - [1024, 2400, 1, 1024] - - [1139, 8939.4] + - [1213, 8939.4] - - [1024, 1440, 1, 1024] - - [1146, 9131.41] + - [1220, 9131.41] - - [1024, 2912, 1, 1024] - - [1139, 9140.03] + - [1213, 9140.03] - - [4096, 35, 1, 2048] - - [1165, 4059.85] + - [1239, 4059.85] - - [4096, 63, 1, 2048] - - [1167, 6946.5] + - [1241, 6946.5] - - [1024, 2880, 1, 1024] - - [1137, 9104.98] + - [1211, 9104.98] - - [1024, 4064, 1, 1024] - - [1156, 9715.2] + - [1230, 9715.2] - - [1024, 4655, 1, 1024] - - [1144, 9033.9] + - [1218, 9033.9] - - [1024, 1088, 1, 1024] - - [1128, 8144.41] + - [1202, 8144.41] - - [36548, 6272, 1, 1024] - - [1145, 10427.4] + - [1219, 10427.4] - - [1024, 1, 1, 13] - - [1158, 0.1] + - [1232, 0.1] - - [768, 512, 1, 768] - - [1182, 5889.14] + - [1256, 5889.14] - - [768, 2048, 1, 3072] - - [1192, 9394.72] + - [1266, 9394.72] - - [768, 32, 1, 768] - - [1204, 1502.84] + - [1278, 1502.84] - - [64, 128, 96, 128] - - [1199, 4973.58] + - [1273, 4973.58] - - [3072, 1024, 1, 768] - - [1193, 9856.17] + - [1267, 9856.17] - - [768, 1024, 1, 3072] - - [1186, 8611.16] + - [1260, 8611.16] - - [768, 512, 1, 3072] - - [1185, 6430.89] + - [1259, 6430.89] - - [768, 64, 1, 768] - - [1206, 2621.54] + - [1280, 2621.54] - - [768, 4096, 1, 3072] - - [1191, 10030.5] + - [1265, 10030.5] - - [768, 2048, 1, 2] - - [1184, 381.863] + - [1258, 381.863] - - [768, 2048, 1, 768] - - [1189, 9754.3] + - [1263, 9754.3] - - [768, 320, 1, 30522] - - [1202, 8529.5] + - [1276, 8529.5] - - [64, 64, 96, 64] - - [1196, 2496.71] + - [1270, 2496.71] - - [768, 640, 1, 30522] - - [1183, 8253.94] + - [1257, 8253.94] - - [768, 1280, 1, 30522] - - [1188, 9572.95] + - [1262, 9572.95] - - [768, 1280, 1, 768] - - [1192, 8714.03] + - [1266, 8714.03] - - [768, 640, 1, 768] - - [1182, 7293.13] + - [1256, 7293.13] - - [768, 32, 1, 2] - - [1194, 11.9154] + - [1268, 11.9154] - - [3072, 2048, 1, 768] - - [1189, 10019.7] + - [1263, 10019.7] - - [768, 4096, 1, 768] - - [1189, 9927.45] + - [1263, 9927.45] - - [3072, 4096, 1, 768] - - [1192, 10150.2] + - [1266, 10150.2] - - [64, 256, 192, 256] - - [1198, 7054.29] + - [1272, 7054.29] - - [768, 8, 1, 768] - - [1205, 341.039] + - [1279, 341.039] - - [64, 128, 384, 128] - - [1197, 6765.11] + - [1271, 6765.11] - - [768, 1024, 1, 768] - - [1187, 8768.68] + - [1261, 8768.68] - - [768, 320, 1, 768] - - [1203, 6838.64] + - [1277, 6838.64] - - [64, 64, 768, 64] - - [1200, 5388.93] + - [1274, 5388.93] - - [768, 1024, 1, 2] - - [1180, 258.795] + - [1254, 258.795] - - [768, 16, 1, 768] - - [1205, 819.3] + - [1279, 819.3] - - [64, 256, 96, 256] - - [1198, 5893.74] + - [1272, 5893.74] - - [3072, 512, 1, 768] - - [1190, 9722.89] + - [1264, 9722.89] - - [768, 160, 1, 768] - - [1207, 5019.88] + - [1281, 5019.88] - - [768, 4096, 1, 2] - - [1181, 507.475] + - [1255, 507.475] - - [1600, 512, 1, 1024] - - [1211, 7187.05] + - [1285, 7187.05] - - [1024, 512, 1, 64] - - [1209, 2557.6] + - [1283, 2557.6] - - [1024, 512, 1, 1] - - [1208, 71.3348] + - [1282, 71.3348] - - [2048, 512, 1, 1] - - [1210, 90.4945] + - [1284, 90.4945] - - [1024, 200, 1, 1] - - [1216, 40.1] + - [1290, 40.1] - - [32, 200, 1, 1] - - [1212, 1.66863] + - [1286, 1.66863] - - [560, 200, 1, 1024] - - [1220, 4731.45] + - [1294, 4731.45] - - [1, 512, 1, 1] - - [1219, 0.230612] + - [1293, 0.230612] - - [64, 512, 1, 1] - - [1214, 7.68519] + - [1288, 7.68519] - - [1024, 8192, 1, 256] - - [1229, 9519.09] + - [1303, 9519.09] - - [1024, 22016, 1, 256] - - [1235, 9881.22] + - [1309, 9881.22] - - [256, 8976, 1, 4352] - - [1227, 9567.18] + - [1301, 9567.18] - - [512, 256, 1, 2048] - - [1240, 5917.99] + - [1314, 5917.99] - - [1024, 19968, 1, 256] - - [1235, 9882.47] + - [1309, 9882.47] - - [256, 8976, 1, 1536] - - [1225, 8437.45] + - [1299, 8437.45] - - [256, 8976, 1, 33536] - - [1225, 8441.99] + - [1299, 8441.99] - - [1024, 1792, 1, 256] - - [1225, 7757.07] + - [1299, 7757.07] - - [1024, 21504, 1, 256] - - [1235, 9894.0] + - [1309, 9894.0] - - [512, 215, 1, 2048] - - [1241, 4665.74] + - [1315, 4665.74] - - [1024, 7168, 1, 256] - - [1229, 9509.45] + - [1303, 9509.45] - - [256, 8976, 1, 15872] - - [1231, 8914.75] + - [1305, 8914.75] - - [1024, 19712, 1, 256] - - [1235, 9772.0] + - [1309, 9772.0] - - [256, 8976, 1, 5632] - - [1231, 8740.13] + - [1305, 8740.13] - - [1024, 14848, 1, 256] - - [1235, 9756.25] + - [1309, 9756.25] - - [1024, 28672, 1, 256] - - [1235, 9959.02] + - [1309, 9959.02] - - [256, 8976, 1, 9728] - - [1238, 8853.14] + - [1312, 8853.14] - - [1024, 17152, 1, 256] - - [1229, 9737.4] + - [1303, 9737.4] - - [256, 8976, 1, 11520] - - [1231, 8999.3] + - [1305, 8999.3] - - [256, 8976, 1, 8192] - - [1221, 7897.42] + - [1295, 7897.42] - - [1024, 3328, 1, 256] - - [1236, 8593.63] + - [1310, 8593.63] - - [256, 8976, 1, 7424] - - [1231, 8980.57] + - [1305, 8980.57] - - [1024, 18944, 1, 256] - - [1235, 9854.95] + - [1309, 9854.95] - - [1024, 10496, 1, 256] - - [1230, 9454.0] + - [1304, 9454.0] - - [256, 8976, 1, 5376] - - [1228, 9608.47] + - [1302, 9608.47] - - [256, 8976, 1, 6144] - - [1225, 7880.23] + - [1299, 7880.23] - - [1024, 40448, 1, 256] - - [1235, 10016.7] + - [1309, 10016.7] - - [256, 8976, 1, 22016] - - [1238, 8939.97] + - [1312, 8939.97] - - [256, 8976, 1, 4864] - - [1226, 9211.53] + - [1300, 9211.53] - - [256, 8976, 1, 12288] - - [1222, 8065.15] + - [1296, 8065.15] - - [1024, 9728, 1, 256] - - [1235, 9636.35] + - [1309, 9636.35] - - [256, 8976, 1, 2048] - - [1223, 7001.43] + - [1297, 7001.43] - - [1024, 10240, 1, 256] - - [1229, 9620.06] + - [1303, 9620.06] - - [256, 8976, 1, 2304] - - [1227, 9509.84] + - [1301, 9509.84] - - [1024, 7936, 1, 256] - - [1235, 9300.77] + - [1309, 9300.77] - - [768, 256, 1, 2048] - - [1239, 6268.05] + - [1313, 6268.05] - - [1024, 9984, 1, 256] - - [1235, 9477.38] + - [1309, 9477.38] - - [1024, 13312, 1, 256] - - [1235, 9758.66] + - [1309, 9758.66] - - [1024, 16128, 1, 256] - - [1229, 9722.0] + - [1303, 9722.0] - - [1024, 8960, 1, 256] - - [1230, 9398.35] + - [1304, 9398.35] - - [1024, 5120, 1, 256] - - [1236, 9315.6] + - [1310, 9315.6] - - [1024, 11264, 1, 256] - - [1229, 9664.9] + - [1303, 9664.9] - - [256, 8976, 1, 20480] - - [1237, 8279.97] + - [1311, 8279.97] - - [1024, 20992, 1, 256] - - [1229, 9878.97] + - [1303, 9878.97] - - [256, 8976, 1, 9472] - - [1231, 8991.06] + - [1305, 8991.06] - - [256, 8976, 1, 8448] - - [1231, 8983.62] + - [1305, 8983.62] - - [256, 8976, 1, 20992] - - [1232, 8942.21] + - [1306, 8942.21] - - [256, 8976, 1, 10496] - - [1232, 8989.81] + - [1306, 8989.81] - - [1024, 15104, 1, 256] - - [1230, 9676.11] + - [1304, 9676.11] - - [1024, 6400, 1, 256] - - [1238, 9145.99] + - [1312, 9145.99] - - [1024, 4096, 1, 256] - - [1231, 9124.35] + - [1305, 9124.35] - - [256, 8976, 1, 2560] - - [1225, 8566.21] + - [1299, 8566.21] - - [256, 8976, 1, 2816] - - [1227, 9496.94] + - [1301, 9496.94] - - [1024, 7680, 1, 256] - - [1235, 9460.94] + - [1309, 9460.94] - - [256, 8976, 1, 14336] - - [1232, 8226.9] + - [1306, 8226.9] - - [256, 8976, 1, 6656] - - [1232, 8771.52] + - [1306, 8771.52] - - [1024, 3072, 1, 256] - - [1232, 9077.04] + - [1306, 9077.04] - - [256, 8976, 1, 5888] - - [1228, 9546.4] + - [1302, 9546.4] - - [1024, 12288, 1, 256] - - [1229, 9690.91] + - [1303, 9690.91] - - [256, 8976, 1, 26112] - - [1234, 8699.93] + - [1308, 8699.93] - - [1024, 7424, 1, 256] - - [1236, 9256.94] + - [1310, 9256.94] - - [256, 8976, 1, 14848] - - [1237, 8885.89] + - [1311, 8885.89] - - [768, 215, 1, 2048] - - [1239, 5628.69] + - [1313, 5628.69] - - [1024, 2560, 1, 256] - - [1232, 8820.93] + - [1306, 8820.93] - - [256, 8976, 1, 19968] - - [1231, 8928.96] + - [1305, 8928.96] - - [256, 8976, 1, 9984] - - [1231, 8993.22] + - [1305, 8993.22] - - [1024, 4864, 1, 256] - - [1232, 8974.4] + - [1306, 8974.4] - - [1024, 33536, 1, 256] - - [1235, 9943.17] + - [1309, 9943.17] - - [256, 8976, 1, 15104] - - [1232, 8996.73] + - [1306, 8996.73] - - [1024, 2048, 1, 256] - - [1230, 8462.76] + - [1304, 8462.76] - - [256, 8976, 1, 8960] - - [1232, 8999.02] + - [1306, 8999.02] - - [1024, 6144, 1, 256] - - [1237, 9359.77] + - [1311, 9359.77] - - [1024, 14592, 1, 256] - - [1235, 9667.52] + - [1309, 9667.52] - - [256, 8976, 1, 19712] - - [1231, 9020.21] + - [1305, 9020.21] - - [1024, 11520, 1, 256] - - [1230, 9527.8] + - [1304, 9527.8] - - [1024, 5632, 1, 256] - - [1229, 9297.3] + - [1303, 9297.3] - - [256, 8976, 1, 11008] - - [1238, 8994.9] + - [1312, 8994.9] - - [256, 8976, 1, 17152] - - [1232, 9003.9] + - [1306, 9003.9] - - [256, 8976, 1, 3072] - - [1221, 8262.06] + - [1295, 8262.06] - - [1024, 3840, 1, 256] - - [1238, 8671.99] + - [1312, 8671.99] - - [1024, 14336, 1, 256] - - [1235, 9760.38] + - [1309, 9760.38] - - [1024, 20480, 1, 256] - - [1229, 9887.95] + - [1303, 9887.95] - - [1024, 23552, 1, 256] - - [1229, 9890.56] + - [1303, 9890.56] - - [256, 8976, 1, 7168] - - [1224, 8478.44] + - [1298, 8478.44] - - [1024, 13568, 1, 256] - - [1229, 9654.74] + - [1303, 9654.74] - - [1024, 4608, 1, 256] - - [1237, 9218.35] + - [1311, 9218.35] - - [256, 8976, 1, 10240] - - [1222, 8076.26] + - [1296, 8076.26] - - [1024, 8704, 1, 256] - - [1231, 9475.6] + - [1305, 9475.6] - - [1024, 11008, 1, 256] - - [1235, 9525.06] + - [1309, 9525.06] - - [1024, 8448, 1, 256] - - [1229, 9352.26] + - [1303, 9352.26] - - [256, 8976, 1, 44505] - - [1233, 8430.33] + - [1307, 8430.33] + - - [6272, 256, 1, 528] + - [1359, 7390.04] + - - [3136, 2048, 1, 1024] + - [1340, 9658.04] + - - [6272, 112, 1, 512] + - [1338, 5931.19] + - - [2048, 320, 1, 1280] + - [1358, 7773.09] + - - [289, 256, 1, 1568] + - [1379, 3718.27] + - - [3136, 64, 64, 64] + - [1318, 8201.25] + - - [50176, 128, 1, 256] + - [1341, 8908.68] + - - [5329, 64, 1, 448] + - [1324, 4602.3] + - - [289, 192, 1, 1344] + - [1376, 3452.69] + - - [12544, 1024, 1, 256] + - [1341, 9742.74] + - - [784, 64, 32, 192] + - [1317, 6844.71] + - - [6272, 64, 1, 480] + - [1325, 5562.34] + - - [196, 128, 1, 800] + - [1367, 1639.84] + - - [64, 512, 1, 1344] + - [1366, 2313.14] + - - [6272, 64, 1, 512] + - [1324, 5609.29] + - - [6272, 160, 1, 528] + - [1325, 6149.8] + - - [289, 160, 32, 768] + - [1352, 6637.92] + - - [12544, 256, 1, 1024] + - [1359, 8790.56] + - - [289, 224, 1, 1568] + - [1379, 3270.27] + - - [5329, 64, 32, 160] + - [1332, 9091.14] + - - [5329, 96, 1, 576] + - [1359, 5555.76] + - - [3025, 64, 1, 363] + - [1377, 4392.4] + - - [784, 32, 32, 192] + - [1348, 5633.9] + - - [3136, 512, 1, 1024] + - [1344, 7553.24] + - - [6272, 16, 1, 480] + - [1379, 3219.95] + - - [1225, 64, 32, 288] + - [1339, 8240.68] + - - [64, 256, 1, 1536] + - [1372, 1456.46] + - - [289, 192, 32, 768] + - [1351, 7372.9] + - - [2048, 448, 1, 1280] + - [1334, 8403.11] + - - [3136, 2048, 1, 512] + - [1333, 9486.41] + - - [289, 256, 1, 2016] + - [1379, 3876.18] + - - [289, 384, 32, 1024] + - [1318, 7350.64] + - - [1568, 32, 1, 832] + - [1368, 2717.97] + - - [3136, 64, 32, 64] + - [1321, 7657.36] + - - [289, 160, 1, 1120] + - [1375, 2827.0] + - - [6272, 128, 1, 528] + - [1329, 6926.36] + - - [21609, 32, 1, 288] + - [1330, 3699.0] + - - [1225, 192, 1, 1728] + - [1363, 7309.91] + - - [4096, 512, 1, 4096] + - [1346, 10272.2] + - - [64, 256, 1, 1152] + - [1372, 1387.92] + - - [6272, 96, 1, 480] + - [1360, 6371.66] + - - [784, 96, 1, 800] + - [1380, 3330.37] + - - [2048, 448, 1, 2048] + - [1334, 8622.75] + - - [784, 96, 32, 192] + - [1349, 7092.46] + - - [3136, 64, 64, 256] + - [1342, 9579.26] + - - [289, 224, 1, 1344] + - [1379, 3180.11] + - - [1001, 512, 1, 4096] + - [1320, 8195.17] + - - [2048, 192, 1, 1280] + - [1325, 6120.19] + - - [1225, 64, 32, 256] + - [1330, 8076.72] + - - [2048, 256, 1, 1536] + - [1320, 8137.8] + - - [1225, 64, 1, 1200] + - [1379, 3552.97] + - - [6272, 128, 1, 512] + - [1333, 6878.31] + - - [729, 192, 1, 1600] + - [1378, 5016.87] + - - [289, 192, 1, 896] + - [1376, 3091.97] + - - [1568, 384, 1, 832] + - [1359, 6934.72] + - - [784, 16, 32, 192] + - [1350, 3380.38] + - - [1568, 256, 1, 832] + - [1324, 5980.96] + - - [1568, 48, 1, 832] + - [1381, 3275.19] + - - [1568, 192, 1, 832] + - [1319, 4441.21] + - - [289, 192, 32, 1024] + - [1322, 6563.16] + - - [6272, 32, 1, 528] + - [1363, 4998.77] + - - [49, 128, 1, 1200] + - [1364, 550.275] + - - [1225, 64, 32, 384] + - [1336, 8589.43] + - - [289, 128, 1, 896] + - [1375, 2103.2] + - - [1568, 160, 1, 832] + - [1363, 6995.15] + - - [1001, 32, 1, 1024] + - [1372, 1744.82] + - - [2048, 320, 1, 2048] + - [1357, 7118.14] + - - [2048, 384, 1, 1536] + - [1320, 8184.11] + - - [50176, 512, 1, 256] + - [1332, 9852.5] + - - [289, 256, 1, 1792] + - [1381, 3809.85] + - - [64, 448, 1, 1152] + - [1373, 2128.33] + - - [5041, 96, 1, 576] + - [1358, 5279.4] + - - [6272, 192, 1, 480] + - [1320, 7479.75] + - - [784, 32, 32, 256] + - [1347, 5709.01] + - - [1001, 32, 1, 2048] + - [1374, 2141.14] + - - [289, 192, 1, 1120] + - [1370, 3277.87] + - - [6272, 32, 1, 512] + - [1362, 4978.8] + - - [289, 384, 1, 3456] + - [1379, 5904.24] + - - [289, 384, 1, 2592] + - [1380, 5707.44] + - - [784, 128, 64, 512] + - [1326, 8864.49] + - - [12544, 1024, 1, 512] + - [1341, 10008.4] + - - [12544, 256, 1, 512] + - [1359, 8628.18] + - - [6272, 24, 1, 512] + - [1363, 3568.17] + - - [5041, 192, 1, 720] + - [1334, 8424.52] + - - [64, 320, 1, 1728] + - [1367, 1469.76] + - - [784, 128, 32, 256] + - [1335, 8104.24] + - - [289, 96, 1, 864] + - [1373, 1838.35] + - - [1225, 32, 32, 192] + - [1354, 5949.82] + - - [1568, 128, 1, 832] + - [1362, 5718.79] + - - [289, 128, 32, 768] + - [1320, 7289.35] + - - [3136, 256, 64, 64] + - [1328, 9104.02] + - - [196, 64, 1, 800] + - [1366, 915.72] + - - [4096, 512, 1, 9216] + - [1343, 10351.5] + - - [12544, 64, 1, 147] + - [1333, 5069.43] + - - [784, 32, 1, 400] + - [1364, 1140.46] + - - [6272, 160, 1, 512] + - [1324, 6140.18] + - - [1225, 48, 32, 288] + - [1330, 5978.71] + - - [64, 320, 1, 2880] + - [1371, 1920.1] + - - [1225, 64, 32, 192] + - [1324, 7641.11] + - - [1001, 32, 1, 1536] + - [1372, 2084.89] + - - [784, 64, 32, 256] + - [1316, 6990.61] + - - [64, 384, 1, 1152] + - [1373, 1862.7] + - - [784, 512, 64, 128] + - [1327, 9026.05] + - - [3136, 512, 1, 2048] + - [1345, 7764.4] + - - [6272, 144, 1, 512] + - [1320, 5574.14] + - - [1225, 192, 32, 384] + - [1334, 9373.93] + - - [64, 192, 1, 1728] + - [1372, 1206.56] + - - [8192, 320, 1, 1280] + - [1386, 9876.02] + - - [8192, 320, 1, 2048] + - [1389, 9745.8] + - - [8192, 384, 1, 1280] + - [1386, 10046.3] + - - [8192, 192, 1, 1280] + - [1389, 9951.0] + - - [8192, 192, 1, 2048] + - [1385, 9559.77] + - - [8192, 384, 1, 2048] + - [1387, 9945.84] + - - [8192, 448, 1, 2048] + - [1388, 9908.61] + - - [1001, 64, 1, 1536] + - [1382, 3650.04] + - - [8192, 448, 1, 1280] + - [1386, 9981.45] + - - [1001, 64, 1, 2048] + - [1383, 3580.97] + - - [1001, 128, 1, 2048] + - [1384, 5587.97] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml index eb99e9a3c..a78fe0364 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml @@ -65675,24 +65675,24 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -65700,32 +65700,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -65734,36 +65739,46 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65773,6 +65788,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65782,53 +65798,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 413 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -65836,76 +65863,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -65913,6 +65950,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65922,6 +65960,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65931,133 +65970,159 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 414 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 2, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 5120 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66067,6 +66132,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66076,53 +66142,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 415 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -66130,8 +66207,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66139,67 +66216,77 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66207,6 +66294,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66216,6 +66304,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66225,53 +66314,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 416 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -66279,39 +66379,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66324,31 +66425,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66356,6 +66466,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66365,6 +66476,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66374,47 +66486,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 417 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 - SubGroup0: 12 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 12 + SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingType: B + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -66426,85 +66549,92 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66514,6 +66644,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66523,47 +66654,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 418 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id003 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -66577,76 +66719,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 24 - LSPB: 24 - LVCA: 8 - LVCB: 8 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 4608 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66654,6 +66806,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66663,6 +66816,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66672,47 +66826,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 419 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id003 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -66726,39 +66891,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66772,30 +66938,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66803,6 +66978,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66812,6 +66988,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66821,49 +66998,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 420 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -66873,10 +67061,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66887,27 +67075,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 16 - LSPB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66921,37 +67110,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66961,6 +67158,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66970,49 +67168,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 421 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id009 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -67022,8 +67233,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -67036,27 +67247,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 16 - LSPB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67070,30 +67282,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -67101,6 +67320,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -67110,6 +67330,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -67119,49 +67340,33707 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 422 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 423 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS0_FL1_GRVW2_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 424 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 425 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 426 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 427 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 428 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 429 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 430 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 431 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 432 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 433 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 434 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 435 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 436 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 437 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 438 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 439 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 440 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 441 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 442 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 443 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 444 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 445 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 446 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 447 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 448 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 449 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW1_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 450 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 451 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 452 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 453 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 454 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 455 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 456 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 457 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 458 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 459 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 460 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 461 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 462 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3200 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 128 + LdsOffsetB_Blk: 2176 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 463 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 464 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 465 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 466 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 467 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 468 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 469 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 470 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 471 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 472 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 473 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 474 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 475 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 476 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 477 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 478 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 479 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 480 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 481 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 482 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 483 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 484 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 485 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA4_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 486 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 487 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 488 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 489 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 490 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 491 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 492 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 493 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 494 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 495 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 496 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR0_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 497 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 498 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB0_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 499 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 500 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 501 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 502 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 503 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 504 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 505 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 506 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 507 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU4_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 508 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 509 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 510 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 511 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 512 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 513 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3200 + LdsOffsetA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 514 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 515 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 516 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 517 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 518 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 519 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 520 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 2, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 5120 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 521 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 522 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 523 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 524 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 24 + LSPB: 24 + LVCA: 8 + LVCB: 8 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 525 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 526 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 527 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 528 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 529 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 530 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 531 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 532 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 533 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 534 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 535 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 536 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 537 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 538 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 539 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 540 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 541 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 542 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 543 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 544 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 545 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 546 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 547 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 548 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 549 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 550 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 551 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 552 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 553 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 554 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 555 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 556 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 557 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 558 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 559 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 560 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 561 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 562 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 563 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 564 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 565 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 566 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 567 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 568 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 569 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 570 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 571 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 572 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 573 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 574 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 575 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 576 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 577 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 578 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 579 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 580 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 581 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 582 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 583 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 584 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 585 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 586 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 587 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 588 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 589 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 590 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 591 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 592 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 593 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 594 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 595 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 596 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 597 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 598 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 599 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 600 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 601 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [6, 8] + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 602 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 603 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 604 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 605 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 606 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 607 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 608 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 609 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 610 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 611 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 612 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 613 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 614 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 615 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 616 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 617 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 618 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 619 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 620 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 621 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 622 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 623 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 624 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 625 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 626 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 627 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 628 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 629 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 4 + LSCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 630 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 2 + LSCB: 2 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 631 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 632 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 633 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id005 + ThreadTile: *id033 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -67171,10 +101050,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -67182,47 +101061,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67230,13 +101109,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 64 PerformanceSyncLocation: -1 @@ -67282,35 +101161,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 423 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 634 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 + ThreadTile: *id033 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -67320,58 +101199,207 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 635 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67379,8 +101407,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -67431,47 +101459,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 424 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 636 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 + VectorWidth: 2 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -67480,26 +101508,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 32 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 4 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 896 + LdsNumElements: 1024 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -67509,17 +101537,17 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -67528,15 +101556,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67580,85 +101608,83 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 425 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 637 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id005 + ThreadTile: *id033 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67666,10 +101692,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67677,15 +101703,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67729,96 +101753,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 426 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 638 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: *id038 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67826,15 +101849,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67878,47 +101899,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 427 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SolutionIndex: 639 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id036 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -67926,37 +101946,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -67964,9 +101984,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -67976,14 +101996,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68027,47 +102045,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 428 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionIndex: 640 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -68075,48 +102092,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 16 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68125,14 +102142,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68176,96 +102191,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 429 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 + SolutionIndex: 641 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68273,15 +102287,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68325,96 +102337,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 430 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionIndex: 642 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 SubGroupA: 16 SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id039 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id009 + WorkGroup: *id037 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68422,15 +102433,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68474,35 +102483,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 431 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionIndex: 643 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: *id037 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -68512,58 +102521,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68572,14 +102580,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68623,35 +102629,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 432 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 644 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: *id038 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -68661,43 +102667,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68709,10 +102714,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68720,15 +102725,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68772,35 +102775,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 433 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 645 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + ThreadTile: *id039 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -68810,43 +102813,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68858,10 +102860,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68869,15 +102871,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68921,35 +102921,36 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 434 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 646 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id039 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -68959,41 +102960,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69007,10 +103008,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69018,15 +103019,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69070,35 +103074,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 435 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 647 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69108,8 +103122,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69119,14 +103133,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 @@ -69168,14 +103182,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69219,35 +103236,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 436 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 648 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69257,37 +103284,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3200 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -69306,9 +103333,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69316,15 +103343,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69368,35 +103398,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 437 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69406,41 +103446,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69454,10 +103494,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69465,15 +103505,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69517,35 +103560,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 438 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id009 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69555,41 +103608,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69603,10 +103656,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69614,15 +103667,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69666,35 +103722,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 439 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -69704,8 +103770,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69715,30 +103781,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69752,10 +103818,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69763,15 +103829,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69815,48 +103884,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 440 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -69864,47 +103943,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69912,15 +103991,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69964,14 +104046,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 441 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -69981,39 +104070,42 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -70021,39 +104113,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70061,15 +104153,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70113,48 +104208,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 442 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70162,47 +104267,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 32 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70210,15 +104315,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70262,14 +104370,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 443 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -70279,16 +104394,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70301,57 +104419,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70359,15 +104477,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70411,48 +104532,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 444 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70460,36 +104591,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 32 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -70497,9 +104628,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -70508,15 +104639,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70560,33 +104694,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 445 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70599,7 +104743,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -70609,47 +104753,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70657,15 +104801,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70709,33 +104856,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 446 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70748,57 +104905,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70806,15 +104963,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70858,33 +105018,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 447 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70897,57 +105067,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70955,15 +105125,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71007,33 +105180,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 448 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71045,58 +105228,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71104,20 +105283,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -71156,33 +105338,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 449 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71195,36 +105387,36 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -71234,18 +105426,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71253,15 +105445,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71305,33 +105500,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 450 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 662 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71344,7 +105549,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -71354,12 +105559,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -71371,9 +105576,9 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -71383,18 +105588,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71402,15 +105607,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71454,33 +105662,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 451 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71493,57 +105711,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71551,15 +105769,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71603,46 +105824,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 452 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -71652,36 +105883,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -71689,10 +105920,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71700,15 +105931,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71752,33 +105986,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 453 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71791,7 +106035,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -71801,12 +106045,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -71818,26 +106062,26 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -71850,14 +106094,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71901,33 +106148,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 454 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71940,57 +106197,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71998,15 +106255,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72050,33 +106310,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 455 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72089,57 +106359,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72147,15 +106417,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72199,33 +106472,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 456 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72237,58 +106520,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72296,20 +106575,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -72348,33 +106630,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 457 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72386,8 +106678,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -72397,12 +106689,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -72414,29 +106706,25 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72445,20 +106733,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -72497,33 +106788,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 458 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id017 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72536,9 +106837,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -72546,47 +106847,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72594,15 +106895,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72646,33 +106950,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 459 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id020 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72685,7 +106999,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -72695,47 +107009,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72743,15 +107057,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72795,33 +107112,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 460 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 672 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72834,9 +107161,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -72844,46 +107171,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72893,14 +107220,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72944,33 +107274,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 461 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72983,57 +107323,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73041,15 +107381,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73093,96 +107436,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 462 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73190,15 +107543,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73242,85 +107598,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 463 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73328,10 +107694,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73339,15 +107705,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73391,33 +107760,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 464 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -73430,57 +107809,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73488,15 +107867,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73540,33 +107922,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 465 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -73579,7 +107971,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -73589,47 +107981,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73637,15 +108029,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73689,33 +108084,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 466 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -73728,7 +108133,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -73738,47 +108143,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73786,15 +108191,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73838,33 +108246,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 467 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -73876,58 +108294,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73935,20 +108349,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -73987,33 +108404,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 468 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -74026,57 +108453,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74084,15 +108511,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74136,33 +108566,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 469 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -74175,57 +108615,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74233,15 +108673,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74285,33 +108728,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 470 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -74323,10 +108776,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -74334,47 +108787,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2624 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74382,20 +108831,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -74434,33 +108886,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 471 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -74473,57 +108935,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74531,15 +108993,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74583,96 +109048,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 472 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74680,15 +109155,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74732,85 +109210,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 473 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3600 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -74818,10 +109306,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74829,15 +109317,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74881,46 +109372,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 474 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -74930,47 +109431,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 - LVPA: 16 - LVPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74978,15 +109479,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75030,46 +109534,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 475 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75079,43 +109593,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 6176 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -75127,15 +109641,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75179,46 +109696,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 476 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75228,43 +109755,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 6176 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -75276,15 +109803,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75328,46 +109858,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 477 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75377,63 +109917,66 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 - LVPA: 16 - LVPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MinGlobalWriteVectorWidth: 1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75477,48 +110020,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 478 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -75526,47 +110079,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75574,15 +110127,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75626,46 +110182,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 479 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75675,46 +110241,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75723,15 +110289,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75775,46 +110344,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 480 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75824,47 +110403,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75872,20 +110447,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -75924,46 +110502,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 481 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75973,36 +110561,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -76010,9 +110598,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76023,13 +110611,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76073,46 +110664,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 482 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76122,47 +110723,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76170,15 +110771,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76222,46 +110826,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 483 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 695 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76271,47 +110885,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76319,15 +110933,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76371,46 +110988,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 484 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 696 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76420,47 +111047,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 13376 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 256 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76468,15 +111095,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76520,46 +111150,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 485 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 697 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76569,36 +111209,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -76607,9 +111247,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76617,15 +111257,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76669,96 +111312,102 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 486 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 698 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2624 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76766,20 +111415,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -76818,46 +111470,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 487 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 699 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id020 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76867,36 +111529,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -76905,9 +111567,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76915,15 +111577,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -76967,46 +111632,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 488 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 700 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77016,63 +111691,66 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MinGlobalWriteVectorWidth: 1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77116,46 +111794,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 489 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 701 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77165,46 +111853,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77213,15 +111901,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77265,47 +111956,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 490 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 702 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -77314,47 +112015,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 12864 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 512 + LdsOffsetB_Blk: 8704 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77362,15 +112063,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77414,46 +112118,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 491 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 703 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77463,36 +112177,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -77501,9 +112215,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77511,15 +112225,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77563,35 +112280,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 492 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 704 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77601,8 +112328,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77616,10 +112343,10 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 @@ -77629,15 +112356,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77650,9 +112377,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77660,15 +112387,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77712,35 +112442,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 493 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77750,8 +112490,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77765,10 +112505,10 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 @@ -77778,15 +112518,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77798,10 +112538,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77811,13 +112551,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -77861,35 +112604,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 494 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 706 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77899,43 +112652,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77947,10 +112700,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77958,15 +112711,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78010,46 +112766,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 495 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [6, 8] - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78059,32 +112825,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78095,11 +112861,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78107,15 +112873,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78159,46 +112928,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 496 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 708 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78212,28 +112991,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 13440 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 8192 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78244,11 +113023,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78258,13 +113037,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78308,48 +113090,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 497 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 709 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -78360,29 +113152,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 32 + LVCA: 2 LVCB: 4 - LVPA: 16 + LVPA: 32 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3408 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78393,11 +113185,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78405,15 +113197,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78457,35 +113252,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 498 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 710 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -78495,8 +113300,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78506,32 +113311,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78543,10 +113348,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78554,15 +113359,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78606,47 +113414,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 499 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 711 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -78655,32 +113473,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78691,11 +113509,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78704,14 +113522,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78755,46 +113576,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 500 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 2 + WorkGroup: [8, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78804,32 +113635,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78840,7 +113671,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -78853,14 +113684,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -78904,46 +113738,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 501 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 713 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78953,32 +113797,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78989,10 +113833,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -79001,15 +113845,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79053,48 +113900,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 502 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 714 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79102,32 +113959,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79138,11 +113995,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79150,15 +114007,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79202,33 +114062,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 503 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 715 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79241,7 +114111,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -79251,32 +114121,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79289,9 +114159,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79300,14 +114170,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79351,33 +114224,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 504 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 716 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79389,8 +114272,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -79400,12 +114283,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -79417,15 +114300,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79438,9 +114317,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79448,20 +114327,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -79500,33 +114382,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 505 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 717 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79539,42 +114431,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79586,10 +114478,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79597,15 +114489,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79649,33 +114544,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 506 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 718 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79688,7 +114593,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -79698,12 +114603,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -79715,15 +114620,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79747,14 +114652,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79798,33 +114706,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 507 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 719 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79837,42 +114755,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79885,9 +114803,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79895,15 +114813,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79947,33 +114868,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 508 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 720 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -79986,7 +114917,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -79999,29 +114930,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80033,10 +114964,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80044,15 +114975,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80096,33 +115030,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 509 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 721 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80135,7 +115079,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -80148,9 +115092,9 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -80162,15 +115106,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80194,14 +115138,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80245,33 +115192,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 510 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80284,7 +115241,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -80297,9 +115254,9 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -80311,15 +115268,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80331,10 +115288,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80343,14 +115300,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80394,33 +115354,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 511 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80433,42 +115403,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80480,9 +115450,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80491,15 +115461,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80543,33 +115516,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 512 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80582,8 +115565,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -80592,32 +115575,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80629,10 +115612,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80640,15 +115623,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80692,33 +115678,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 513 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 725 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80731,42 +115727,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80778,10 +115774,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80789,15 +115785,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80841,33 +115840,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 514 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 726 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -80880,7 +115889,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -80890,12 +115899,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -80907,15 +115916,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80927,10 +115936,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80938,15 +115947,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80990,46 +116002,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 515 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 727 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81039,32 +116061,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81075,10 +116097,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -81087,15 +116109,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81139,46 +116164,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 516 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 728 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81191,29 +116226,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81224,11 +116259,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81236,15 +116271,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81288,46 +116326,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 517 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 729 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81340,29 +116388,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81373,11 +116421,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81385,15 +116433,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81437,46 +116488,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 518 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 730 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81486,32 +116547,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81522,7 +116583,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -81535,14 +116596,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81586,46 +116650,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 519 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 731 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81635,32 +116709,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81671,11 +116745,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81683,15 +116757,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81735,46 +116812,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 520 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81784,32 +116871,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81820,11 +116907,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81833,14 +116920,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81884,46 +116974,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 521 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 733 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81933,32 +117033,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81969,11 +117069,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81981,15 +117081,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82033,33 +117136,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 522 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 734 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -82072,7 +117185,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -82082,12 +117195,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 @@ -82099,15 +117212,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 9280 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82119,9 +117232,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 32 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -82130,15 +117243,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82182,55 +117298,65 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 523 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 735 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -82238,25 +117364,25 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 4 - LSCB: 4 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 14464 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82267,11 +117393,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82279,15 +117405,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82331,48 +117460,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 524 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 736 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id032 - WorkGroupMapping: 1 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82380,32 +117519,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 2 - LSCB: 2 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82416,11 +117555,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82429,14 +117568,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82480,96 +117622,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 525 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 737 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id032 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 32 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3424 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82577,15 +117729,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82629,33 +117784,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 526 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 738 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -82668,57 +117833,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82726,15 +117891,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82778,96 +117946,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 527 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 739 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82875,15 +118053,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82927,96 +118108,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 528 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 740 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83024,15 +118215,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83076,96 +118270,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 529 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 741 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [8, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83173,15 +118377,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83225,33 +118432,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 530 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 742 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83264,57 +118481,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83322,15 +118539,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83374,32 +118594,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 531 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 743 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83411,56 +118642,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -83469,13 +118701,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83519,13 +118756,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 532 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 744 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - ThreadTile: *id036 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -83535,17 +118780,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id038 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83557,57 +118804,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83615,13 +118863,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83665,95 +118918,102 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 533 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 745 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4736 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 4160 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83761,18 +119021,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -83811,95 +119076,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 534 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 746 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2112 LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83907,13 +119183,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83957,91 +119238,102 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 535 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 747 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -84053,13 +119345,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84103,95 +119400,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 536 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 748 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id039 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id037 + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84199,13 +119507,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84249,33 +119562,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 537 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 749 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id037 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -84287,16 +119610,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -84307,37 +119631,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6272 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84345,13 +119669,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84395,13 +119724,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 538 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 750 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id036 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -84412,16 +119749,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id038 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -84433,16 +119772,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -84453,20 +119793,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -84480,10 +119820,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84495,9 +119835,14 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84541,13 +119886,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 539 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id039 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 751 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -84558,16 +119911,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id040 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -84579,6 +119934,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -84593,8 +119949,8 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -84644,6 +120000,11 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -84687,26 +120048,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 540 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id039 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 752 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id040 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84740,7 +120110,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -84754,15 +120124,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84786,7 +120156,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -84844,8 +120214,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 541 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 753 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84864,9 +120234,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84888,10 +120258,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -84899,32 +120269,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2688 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84936,9 +120302,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -84948,12 +120314,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84963,7 +120329,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -85006,14 +120372,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 542 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 754 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -85026,9 +120392,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85053,7 +120419,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -85061,10 +120427,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -85072,21 +120438,21 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85098,10 +120464,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85110,10 +120476,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -85168,16 +120534,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 543 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 755 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -85188,9 +120554,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85223,10 +120589,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -85240,15 +120606,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85261,9 +120627,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85271,12 +120637,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85330,29 +120696,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 544 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 756 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85388,29 +120754,29 @@ GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85423,9 +120789,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85434,12 +120800,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85492,15 +120858,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 545 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 757 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -85512,9 +120878,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85547,32 +120913,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85584,10 +120950,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85595,13 +120961,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85654,29 +121020,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 546 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 758 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85700,7 +121066,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -85709,32 +121075,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85746,10 +121112,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85758,11 +121124,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85816,16 +121182,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 547 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 759 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -85836,9 +121202,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85871,10 +121237,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -85888,15 +121254,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85909,9 +121275,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85919,12 +121285,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85978,28 +121344,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 548 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 760 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 32, 1] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -86024,7 +121390,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -86033,32 +121399,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86070,10 +121436,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86082,11 +121448,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86140,16 +121506,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 549 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + SolutionIndex: 761 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -86160,8 +121526,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -86195,10 +121561,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -86212,15 +121578,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86233,9 +121599,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86243,12 +121609,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86302,8 +121668,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 550 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 762 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86311,18 +121677,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B @@ -86340,7 +121706,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86349,7 +121715,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -86365,24 +121731,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86393,11 +121759,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86405,11 +121771,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -86464,20 +121830,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 551 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -86485,10 +121851,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -86536,15 +121902,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86626,8 +121992,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 764 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86648,7 +122014,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86673,7 +122039,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -86681,7 +122047,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -86692,21 +122058,21 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86718,10 +122084,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86733,7 +122099,7 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -86788,16 +122154,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 765 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -86809,8 +122175,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86860,15 +122226,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86881,9 +122247,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86891,12 +122257,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86950,20 +122316,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 766 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -86971,8 +122337,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86994,9 +122360,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -87005,7 +122371,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -87015,18 +122381,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87038,10 +122408,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87049,12 +122419,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87065,7 +122435,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -87108,29 +122478,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 767 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -87146,7 +122516,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87171,24 +122541,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87199,11 +122569,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87215,9 +122585,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87270,15 +122640,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 768 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -87291,10 +122661,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87308,7 +122678,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87333,24 +122703,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87361,11 +122731,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87373,8 +122743,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -87432,8 +122802,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 769 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87442,11 +122812,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -87454,9 +122824,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87470,7 +122840,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87487,7 +122857,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -87495,24 +122865,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87523,11 +122893,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87535,12 +122905,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87594,31 +122964,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 770 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87632,7 +123002,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87652,29 +123022,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 512 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87685,7 +123055,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -87697,13 +123067,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87756,31 +123126,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 559 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + SolutionIndex: 771 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87794,7 +123164,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87811,32 +123181,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87847,11 +123217,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87859,13 +123229,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87918,31 +123288,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 560 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 772 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87956,7 +123326,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87965,7 +123335,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -87973,32 +123343,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88009,11 +123379,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88022,11 +123392,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88080,16 +123450,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 561 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 773 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -88100,11 +123470,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -88118,7 +123488,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88135,32 +123505,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88171,11 +123541,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88183,12 +123553,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88242,8 +123612,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 562 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 774 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88251,22 +123621,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -88286,10 +123656,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88307,17 +123677,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -88341,13 +123715,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88357,7 +123731,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -88373,6 +123747,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88382,6 +123757,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88400,29 +123776,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 563 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 775 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -88444,7 +123820,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -88472,10 +123848,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -88515,7 +123895,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -88531,6 +123911,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88540,6 +123921,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88558,8 +123940,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 564 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 776 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88567,7 +123949,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -88580,7 +123962,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -88596,7 +123978,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88605,7 +123987,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88613,47 +123995,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88661,11 +124043,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -88693,6 +124075,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88702,6 +124085,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88720,31 +124104,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 565 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 777 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -88758,7 +124142,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88775,7 +124159,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -88783,23 +124167,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -88811,11 +124195,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88829,7 +124213,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88855,6 +124239,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88864,6 +124249,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88882,16 +124268,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 566 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 778 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -88903,10 +124289,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -88947,37 +124333,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 + LSPA: 32 + LSPB: 16 LVCA: 4 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88985,13 +124371,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89017,6 +124403,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89026,6 +124413,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89044,20 +124432,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 567 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 779 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -89065,8 +124453,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -89082,7 +124470,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89091,7 +124479,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89099,47 +124487,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89147,11 +124535,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -89179,6 +124567,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89188,6 +124577,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89206,31 +124596,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 568 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 780 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -89244,7 +124634,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89264,44 +124654,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 64 - MacroTileA: 256 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89309,13 +124699,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89341,6 +124731,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89350,6 +124741,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89368,31 +124760,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 569 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 781 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -89426,28 +124818,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 128 - LSPB: 128 + LSPA: 32 + LSPB: 32 LVCA: 2 LVCB: 2 - LVPA: 32 - LVPB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -89460,10 +124852,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89471,13 +124863,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89503,6 +124895,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89512,6 +124905,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89530,29 +124924,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 570 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 782 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -89568,7 +124962,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89577,7 +124971,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89585,47 +124979,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89633,13 +125027,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89665,6 +125059,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89674,6 +125069,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89692,31 +125088,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 571 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 783 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -89730,7 +125126,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89747,47 +125143,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89795,13 +125191,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89827,6 +125223,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89836,6 +125233,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89854,31 +125252,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 572 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 784 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -89892,7 +125290,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89901,7 +125299,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89917,39 +125315,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89957,13 +125355,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89989,6 +125387,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89998,6 +125397,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90016,20 +125416,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 573 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 785 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 2 + SubGroupA: 8 + SubGroupB: 2 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -90037,10 +125437,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 2, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -90054,16 +125454,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -90079,19 +125479,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2112 - LdsPadA: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -90103,7 +125507,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -90119,7 +125523,7 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -90131,7 +125535,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90147,6 +125551,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90156,6 +125561,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90174,31 +125580,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 574 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 786 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -90212,7 +125618,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90237,23 +125643,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -90265,11 +125671,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90277,13 +125683,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90309,6 +125715,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90318,6 +125725,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90336,31 +125744,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 575 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 787 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -90401,37 +125809,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 16 + LSPB: 16 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90439,13 +125847,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90471,6 +125879,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90480,6 +125889,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90498,20 +125908,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 576 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 788 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 8 + SubGroupA: 2 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -90519,8 +125929,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroup: [2, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90545,7 +125955,7 @@ ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -90564,16 +125974,16 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 2112 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -90586,10 +125996,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90601,7 +126011,7 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -90629,6 +126039,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90638,6 +126049,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90656,15 +126068,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 577 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 789 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -90677,8 +126089,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90694,7 +126106,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90711,47 +126123,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90759,11 +126171,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -90791,6 +126203,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90800,6 +126213,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90818,16 +126232,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 578 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 790 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -90839,10 +126253,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -90856,7 +126270,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90865,7 +126279,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -90881,39 +126295,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 8 + LSCB: 8 LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPB: 8 + LVCA: 2 + LVCB: 4 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 2 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90921,13 +126335,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90953,6 +126367,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90962,6 +126377,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90980,15 +126396,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 579 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 791 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 2 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 2 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -91001,10 +126417,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 2, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -91027,7 +126443,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91038,29 +126454,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 128 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3600 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91072,10 +126488,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91083,13 +126499,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91115,6 +126531,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91124,6 +126541,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91142,28 +126560,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 580 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 792 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -91180,7 +126598,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91197,7 +126615,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -91205,39 +126623,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91245,11 +126663,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -91277,6 +126695,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91286,6 +126705,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91304,15 +126724,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 581 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 793 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -91324,11 +126744,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -91359,10 +126779,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -91376,15 +126796,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6176 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91408,7 +126828,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -91439,6 +126859,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91448,6 +126869,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91466,8 +126888,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 582 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 794 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91475,7 +126897,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -91486,9 +126908,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -91504,7 +126926,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91521,32 +126943,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6176 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91557,11 +126979,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91570,12 +126992,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91601,6 +127023,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91610,6 +127033,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91628,16 +127052,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 583 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 795 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -91648,11 +127072,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -91683,46 +127107,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91731,13 +127155,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91763,6 +127187,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91772,6 +127197,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91790,28 +127216,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 584 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 796 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 2 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 2 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [2, 8, 4] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -91828,7 +127254,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91853,39 +127279,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91893,13 +127319,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91925,6 +127351,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91934,6 +127361,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91952,31 +127380,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 797 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -91990,7 +127418,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -92010,44 +127438,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92055,11 +127483,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -92087,6 +127515,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92096,6 +127525,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92114,15 +127544,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 798 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -92135,10 +127565,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92158,7 +127588,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -92172,7 +127602,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -92186,11 +127616,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92203,9 +127637,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92213,12 +127647,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92229,13 +127663,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92245,6 +127680,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92254,6 +127690,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92272,8 +127709,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 799 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92281,18 +127718,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -92317,7 +127754,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92327,32 +127764,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92364,10 +127801,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92375,13 +127812,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92398,6 +127837,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92407,6 +127847,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92416,6 +127857,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92434,33 +127876,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 800 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92479,7 +127919,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92489,10 +127929,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -92506,15 +127946,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92526,10 +127966,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92537,12 +127977,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92560,6 +128002,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92569,6 +128012,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92578,6 +128022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92596,8 +128041,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 801 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92605,24 +128050,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92654,7 +128097,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -92668,15 +128111,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92688,9 +128131,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -92699,11 +128142,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -92722,6 +128165,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92731,6 +128175,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92740,6 +128185,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92758,28 +128204,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 802 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -92816,7 +128262,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -92830,15 +128276,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13376 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92850,10 +128296,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92862,11 +128308,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92884,6 +128330,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92893,6 +128340,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92902,6 +128350,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92920,15 +128369,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + SolutionIndex: 803 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [8, 8] ThreadTile0: 8 @@ -92940,8 +128389,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -92965,7 +128414,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92992,15 +128441,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93012,9 +128461,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -93023,11 +128472,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -93046,6 +128497,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93055,6 +128507,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93064,6 +128517,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93082,8 +128536,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 804 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93092,23 +128546,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93126,10 +128578,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93148,17 +128600,21 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93171,9 +128627,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93181,12 +128637,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -93197,13 +128655,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93213,6 +128672,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93222,6 +128682,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93240,33 +128701,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 805 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93295,7 +128754,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -93305,22 +128764,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93332,10 +128791,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93343,13 +128802,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93366,6 +128825,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93375,6 +128835,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93384,6 +128845,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93402,29 +128864,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 806 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -93457,7 +128919,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -93474,15 +128936,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93494,10 +128956,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93505,12 +128967,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -93528,6 +128990,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93537,6 +129000,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93546,6 +129010,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93564,8 +129029,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 807 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93573,12 +129038,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -93586,7 +129051,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -93609,7 +129074,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93629,22 +129094,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93656,10 +129121,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93667,13 +129132,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93690,6 +129157,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93699,6 +129167,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93708,6 +129177,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93726,33 +129196,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 808 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93771,8 +129239,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -93791,22 +129259,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12864 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 512 - LdsOffsetB_Blk: 8704 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93818,10 +129286,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93829,12 +129297,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -93852,6 +129322,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93861,6 +129332,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93870,6 +129342,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93888,20 +129361,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + SolutionIndex: 809 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -93909,12 +129382,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93960,15 +129431,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93980,9 +129451,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -93991,11 +129462,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -94014,6 +129485,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94023,6 +129495,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94032,6 +129505,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94050,29 +129524,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 810 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -94122,15 +129596,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -94142,9 +129616,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94153,11 +129627,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -94176,6 +129650,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94185,6 +129660,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94194,6 +129670,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94212,8 +129689,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 811 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94222,10 +129699,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -94234,7 +129711,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -94257,16 +129734,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -94277,20 +129754,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -94304,10 +129781,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94317,10 +129794,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -94338,6 +129817,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94347,6 +129827,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94356,6 +129837,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94374,15 +129856,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 600 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 812 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -94390,17 +129872,15 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94419,17 +129899,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -94441,18 +129921,18 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -94467,9 +129947,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94477,13 +129957,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94500,6 +129982,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94509,6 +129992,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94518,6 +130002,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94536,33 +130021,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 601 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 813 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94574,7 +130057,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94582,16 +130065,16 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -94599,22 +130082,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -94627,10 +130110,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94639,11 +130122,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 @@ -94662,6 +130145,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94671,6 +130155,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94680,6 +130165,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94698,31 +130184,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 602 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 814 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94736,7 +130222,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94744,15 +130230,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -94761,22 +130247,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 + LdsNumElements: 7232 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -94789,10 +130275,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94801,11 +130287,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 @@ -94824,6 +130310,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94833,6 +130320,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -94842,6 +130330,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -94860,31 +130349,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 603 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 815 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94898,7 +130387,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94907,7 +130396,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94923,23 +130412,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3408 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 7200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true @@ -94951,11 +130440,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94963,12 +130452,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -94986,6 +130475,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94995,6 +130485,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95004,6 +130495,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95022,31 +130514,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 604 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 816 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -95067,17 +130559,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -95087,20 +130579,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -95115,9 +130607,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95125,13 +130617,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95148,6 +130642,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95157,6 +130652,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95166,6 +130662,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95184,33 +130681,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 605 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 817 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95229,42 +130724,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -95276,9 +130771,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -95287,12 +130782,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95310,6 +130807,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95319,6 +130817,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95328,6 +130827,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95346,33 +130846,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 606 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 818 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95391,42 +130889,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -95439,9 +130937,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95449,12 +130947,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95472,6 +130972,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95481,6 +130982,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95490,6 +130992,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95508,8 +131011,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 607 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 819 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -95517,24 +131020,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95553,17 +131054,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -95573,10 +131074,10 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -95613,10 +131114,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95634,6 +131137,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95643,6 +131147,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95652,6 +131157,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95670,8 +131176,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 608 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 820 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -95679,14 +131185,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -95695,8 +131201,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95715,8 +131219,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -95724,8 +131228,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -95735,20 +131239,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -95762,10 +131266,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95773,12 +131277,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95796,6 +131302,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95805,6 +131312,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95814,6 +131322,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95832,33 +131341,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 609 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95878,16 +131385,16 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -95899,18 +131406,18 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -95925,9 +131432,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95935,13 +131442,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95958,6 +131465,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95967,6 +131475,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -95976,6 +131485,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -95994,28 +131504,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 610 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 822 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -96038,18 +131548,18 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -96059,16 +131569,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -96083,9 +131597,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96093,12 +131607,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96109,13 +131623,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96125,6 +131640,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96134,6 +131650,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96152,8 +131669,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 611 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 823 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96161,12 +131678,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -96197,42 +131714,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96245,9 +131762,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96255,12 +131772,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96278,6 +131797,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96287,6 +131807,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96296,6 +131817,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96314,8 +131836,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 612 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 824 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96323,24 +131845,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96359,42 +131879,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96406,10 +131926,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96418,11 +131938,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96440,6 +131962,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96449,6 +131972,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96458,6 +131982,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96476,8 +132001,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 613 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 825 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96485,24 +132010,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96522,41 +132045,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96568,10 +132091,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96580,11 +132103,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96602,6 +132125,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96611,6 +132135,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96620,6 +132145,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96638,28 +132164,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 614 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 + SolutionIndex: 826 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -96684,19 +132210,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -96705,20 +132231,20 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96731,9 +132257,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96741,13 +132267,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96764,6 +132290,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96773,6 +132300,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96782,6 +132310,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96800,29 +132329,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 615 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 827 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -96845,42 +132374,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96892,9 +132421,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -96903,12 +132432,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -96926,6 +132457,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96935,6 +132467,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -96944,6 +132477,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -96962,8 +132496,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 616 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 828 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96972,23 +132506,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97007,42 +132539,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2112 LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97066,11 +132598,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -97088,6 +132622,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97097,6 +132632,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97106,6 +132642,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97124,8 +132661,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 617 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 829 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97144,13 +132681,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97170,41 +132705,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97216,10 +132751,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97228,10 +132763,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -97250,6 +132785,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97259,6 +132795,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97268,6 +132805,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97286,29 +132824,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 618 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 830 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -97331,20 +132869,20 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -97352,21 +132890,21 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97378,10 +132916,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97389,11 +132927,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -97412,6 +132952,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97421,6 +132962,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97430,6 +132972,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97448,33 +132991,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 619 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 831 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97494,41 +133035,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97540,10 +133081,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97552,10 +133093,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -97574,6 +133115,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97583,6 +133125,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97592,6 +133135,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97610,16 +133154,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 620 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 832 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -97630,8 +133174,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -97655,7 +133199,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -97665,10 +133209,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -97682,15 +133226,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97703,9 +133247,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97713,12 +133257,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -97736,6 +133282,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97745,6 +133292,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97754,6 +133302,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97772,8 +133321,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 621 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 833 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97781,24 +133330,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97827,10 +133374,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -97844,15 +133391,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97876,7 +133423,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -97898,6 +133445,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97907,6 +133455,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -97916,6 +133465,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -97934,8 +133484,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 622 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 + SolutionIndex: 834 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97943,7 +133493,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -97954,9 +133504,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -97978,7 +133528,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -97992,29 +133542,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98026,10 +133572,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98037,13 +133583,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98053,13 +133599,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98069,6 +133616,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98078,6 +133626,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98096,29 +133645,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 623 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 835 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -98140,43 +133689,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98188,10 +133733,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98199,13 +133744,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98215,13 +133762,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98231,6 +133779,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98240,6 +133789,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98258,33 +133808,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 624 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 836 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98302,43 +133850,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98350,10 +133894,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98361,13 +133905,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98377,13 +133923,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98393,6 +133940,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98402,6 +133950,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98420,33 +133969,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 625 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 837 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98464,8 +134011,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98475,10 +134022,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -98492,15 +134039,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98512,9 +134055,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98523,11 +134066,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -98539,13 +134084,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98555,6 +134101,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98564,6 +134111,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98582,8 +134130,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 626 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 838 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98592,23 +134140,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98626,7 +134172,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -98637,32 +134183,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98675,9 +134217,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98686,12 +134228,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98701,13 +134243,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98717,6 +134260,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98726,6 +134270,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98744,15 +134289,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 627 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 839 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -98764,9 +134309,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -98788,8 +134333,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98799,32 +134344,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98836,10 +134377,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98847,13 +134388,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98863,13 +134406,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -98879,6 +134423,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -98888,6 +134433,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -98906,33 +134452,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 628 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 840 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98944,13 +134488,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -98961,32 +134505,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 9280 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98997,11 +134537,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99009,13 +134549,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99025,13 +134565,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99041,6 +134582,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99050,6 +134592,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99068,31 +134611,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 629 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 841 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -99106,14 +134649,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -99126,29 +134669,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14464 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99159,11 +134698,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99171,13 +134710,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99187,13 +134728,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99203,6 +134745,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99212,6 +134755,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99230,33 +134774,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 630 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 842 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99268,13 +134810,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -99285,32 +134827,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99321,11 +134859,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99333,13 +134871,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99349,13 +134887,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99365,6 +134904,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99374,6 +134914,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99392,31 +134933,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 631 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 843 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -99430,16 +134971,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99450,29 +134991,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3424 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99483,11 +135020,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99496,7 +135033,9 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -99511,13 +135050,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99527,6 +135067,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99536,6 +135077,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99554,16 +135096,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 632 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 844 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -99574,13 +135116,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99592,16 +135132,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99612,29 +135152,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 64 - LVCA: 2 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99645,10 +135181,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -99657,8 +135193,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -99673,13 +135209,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99689,6 +135226,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99698,6 +135236,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99716,31 +135255,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 633 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 845 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -99760,8 +135299,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -99774,7 +135313,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -99788,15 +135327,11 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99808,9 +135343,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -99819,11 +135354,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -99835,13 +135372,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -99851,6 +135389,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -99860,6 +135399,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -99878,8 +135418,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 634 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 846 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99887,24 +135427,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99922,9 +135460,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -99936,7 +135474,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -99944,21 +135482,17 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 64 - LVCA: 8 + LSPB: 32 + LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99971,9 +135505,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99981,13 +135515,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99997,13 +135531,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100013,6 +135548,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100022,6 +135558,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100040,29 +135577,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 635 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 847 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -100084,8 +135621,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -100098,29 +135635,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100132,10 +135665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100143,13 +135676,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100159,13 +135694,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100175,6 +135711,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100184,6 +135721,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100202,33 +135740,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 636 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 848 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100240,14 +135776,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -100260,29 +135796,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100293,11 +135825,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100305,13 +135837,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100321,13 +135855,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100337,6 +135872,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100346,6 +135882,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100364,33 +135901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 637 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 849 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100402,13 +135937,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 @@ -100427,22 +135962,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 32 - LVCA: 4 - LVCB: 8 + LVCA: 2 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 864 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 576 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -100455,10 +135986,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -100471,9 +136002,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100483,13 +136014,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100499,6 +136031,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100508,6 +136041,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100526,16 +136060,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 638 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 850 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -100547,10 +136081,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -100570,8 +136104,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -100591,20 +136125,16 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -100618,10 +136148,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100629,13 +136159,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100645,13 +136177,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100661,6 +136194,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100670,6 +136204,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100688,33 +136223,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 639 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 851 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100735,7 +136268,7 @@ ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -100753,16 +136286,16 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4736 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetB: 4160 + LdsOffsetB: 1088 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -100776,9 +136309,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 256 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 256 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -100787,13 +136320,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100810,6 +136343,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100819,6 +136353,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100828,6 +136363,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100846,20 +136382,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 640 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 852 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -100867,7 +136403,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -100890,7 +136426,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -100901,7 +136437,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -100911,20 +136447,16 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 576 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -100938,9 +136470,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -100949,13 +136481,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100965,13 +136497,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -100981,6 +136514,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -100990,6 +136524,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101008,20 +136543,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 641 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 853 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -101029,7 +136564,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -101052,7 +136587,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -101063,7 +136598,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101080,13 +136615,9 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -101127,13 +136658,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101143,6 +136675,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101152,6 +136685,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101170,8 +136704,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 642 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 854 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101192,7 +136726,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -101200,7 +136734,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101228,10 +136762,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -101242,15 +136777,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101262,10 +136797,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101273,12 +136808,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -101289,6 +136824,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101296,6 +136832,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101305,6 +136842,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101314,6 +136852,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101332,37 +136871,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 643 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 855 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 32 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101377,16 +136916,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101394,25 +136933,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101424,10 +136964,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101435,12 +136975,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -101451,6 +136993,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101458,6 +137001,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101467,6 +137011,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101476,6 +137021,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101494,8 +137040,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 644 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 856 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101503,12 +137049,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -101519,12 +137065,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101539,7 +137083,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -101556,6 +137100,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -101566,15 +137111,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101586,9 +137131,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -101597,11 +137142,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -101613,13 +137160,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101629,6 +137178,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101638,6 +137188,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101656,8 +137207,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 645 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 857 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101665,11 +137216,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -101681,12 +137232,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101703,7 +137252,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -101714,44 +137263,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101759,11 +137309,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -101775,6 +137325,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101782,6 +137333,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101791,6 +137343,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101800,6 +137353,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101818,28 +137372,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 646 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 858 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -101848,7 +137402,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101864,7 +137418,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -101880,40 +137434,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101921,12 +137476,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -101937,6 +137492,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101944,6 +137500,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -101953,6 +137510,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -101962,6 +137520,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101980,28 +137539,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 647 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 859 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -102010,7 +137569,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102018,15 +137577,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -102034,7 +137593,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102042,21 +137601,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2688 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2112 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102067,11 +137631,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102081,11 +137645,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102095,13 +137662,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102111,6 +137680,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102120,6 +137690,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102138,15 +137709,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 648 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 860 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -102159,16 +137730,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102176,15 +137745,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -102192,7 +137761,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102200,23 +137769,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -102229,11 +137799,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102243,11 +137813,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102257,6 +137830,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102264,6 +137838,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102273,6 +137848,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102282,6 +137858,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102300,16 +137877,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 649 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 861 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -102321,16 +137898,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102344,8 +137919,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -102358,10 +137933,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -102372,15 +137948,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102392,9 +137964,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -102403,13 +137975,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102419,13 +137994,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102435,6 +138012,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102444,6 +138022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102462,37 +138041,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 650 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 862 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102506,8 +138083,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -102517,30 +138094,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -102555,9 +138129,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102567,11 +138141,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102581,13 +138158,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102597,6 +138176,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102606,6 +138186,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102624,15 +138205,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 651 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 863 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -102645,16 +138226,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102686,25 +138265,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102717,9 +138297,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102731,9 +138311,10 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102743,6 +138324,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102750,6 +138332,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102759,6 +138342,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102768,6 +138352,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102786,16 +138371,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 652 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 864 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -102807,8 +138392,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -102816,7 +138401,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102830,9 +138415,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -102848,23 +138433,20 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 + LdsOffsetB: 1088 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -102878,10 +138460,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102894,8 +138476,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102905,13 +138488,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -102921,6 +138506,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -102930,6 +138516,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102948,16 +138535,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 653 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 865 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -102969,8 +138556,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -102978,7 +138565,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102995,7 +138582,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103010,40 +138597,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103051,13 +138639,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103067,6 +138656,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103074,6 +138664,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103083,6 +138674,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103092,6 +138684,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103110,20 +138703,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 654 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 866 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -103131,8 +138724,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -103140,7 +138733,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103148,64 +138741,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103213,13 +138807,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103229,6 +138826,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103236,6 +138834,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103245,6 +138844,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103254,6 +138854,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103272,37 +138873,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 655 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 867 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103310,15 +138909,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -103330,44 +138929,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103375,13 +138975,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103391,6 +138994,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103398,6 +139002,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103407,6 +139012,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103416,6 +139022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103434,37 +139041,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 656 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 868 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103472,64 +139077,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103537,13 +139143,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103553,6 +139162,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103560,6 +139170,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103569,6 +139180,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103578,6 +139190,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103596,37 +139209,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 657 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 869 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103634,16 +139245,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103654,44 +139265,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103699,13 +139311,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103715,6 +139330,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103722,6 +139338,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103731,6 +139348,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103740,6 +139358,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103758,37 +139377,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 658 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 870 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103804,7 +139421,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -103812,48 +139429,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 + LSPA: 32 + LSPB: 8 + LVCA: 8 LVCB: 8 LVPA: 16 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103861,13 +139479,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103877,6 +139496,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103884,6 +139504,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103893,6 +139514,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -103902,6 +139524,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103920,29 +139543,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 659 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 871 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -103950,7 +139573,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103965,9 +139588,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103978,44 +139601,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104023,13 +139647,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104039,6 +139666,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -104046,6 +139674,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104055,6 +139684,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104064,6 +139694,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104082,37 +139713,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 660 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 872 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104120,7 +139749,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104128,8 +139757,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104137,32 +139766,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 8 + LSCB: 8 + LSPA: 96 LSPB: 64 - LVCA: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104173,11 +139803,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104185,13 +139815,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104201,6 +139832,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -104208,6 +139840,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104217,6 +139850,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104226,6 +139860,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104244,37 +139879,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 661 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 873 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104282,15 +139917,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -104306,25 +139941,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104335,10 +139971,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104347,13 +139983,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104363,13 +140002,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104379,6 +140020,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104388,6 +140030,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104406,37 +140049,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 662 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 874 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104444,15 +140085,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -104468,25 +140109,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104497,11 +140139,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104509,13 +140151,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104525,6 +140170,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -104532,6 +140178,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104541,6 +140188,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104550,6 +140198,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104568,8 +140217,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 663 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 875 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104577,12 +140226,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -104592,13 +140241,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104606,16 +140253,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104623,32 +140270,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104659,11 +140307,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104671,13 +140319,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104687,13 +140338,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104703,6 +140356,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104712,6 +140366,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104730,20 +140385,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 664 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 876 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -104751,16 +140406,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104768,14 +140421,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -104792,25 +140445,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104821,11 +140475,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104833,13 +140487,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104849,6 +140506,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -104856,6 +140514,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104865,6 +140524,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -104874,6 +140534,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104892,8 +140553,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 665 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 877 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104901,12 +140562,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -104914,15 +140575,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104930,7 +140589,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104954,25 +140613,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104983,10 +140643,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104995,13 +140655,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105011,6 +140672,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105018,6 +140680,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105027,6 +140690,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -105036,6 +140700,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105054,8 +140719,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 666 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 878 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105063,11 +140728,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -105076,15 +140741,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105092,14 +140757,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -105116,25 +140781,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105145,7 +140811,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -105159,11 +140825,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105173,6 +140842,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105180,6 +140850,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105189,6 +140860,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -105198,6 +140870,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105216,8 +140889,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 667 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 879 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105225,7 +140898,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -105238,15 +140911,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105254,7 +140925,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105263,7 +140934,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105278,25 +140949,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105307,11 +140979,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105323,9 +140995,10 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105335,6 +141008,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105342,6 +141016,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105351,6 +141026,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -105360,6 +141036,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105378,8 +141055,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 668 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 880 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105387,12 +141064,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -105400,15 +141077,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105424,8 +141101,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105436,44 +141113,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105482,12 +141160,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105497,6 +141176,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105504,6 +141184,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105542,28 +141223,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 669 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 881 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -105572,7 +141253,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105587,16 +141268,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -105604,13 +141285,14 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -105641,17 +141323,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105661,6 +141348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105668,6 +141356,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105690,6 +141379,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105706,8 +141396,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 670 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 882 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105715,7 +141405,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -105731,12 +141421,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105744,15 +141432,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -105760,62 +141448,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105825,6 +141519,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105832,6 +141527,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105854,6 +141550,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105870,37 +141567,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 671 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 + SolutionIndex: 883 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105908,14 +141603,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -105928,27 +141623,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105961,25 +141657,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105989,6 +141690,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -105996,6 +141698,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106018,6 +141721,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106034,16 +141738,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 672 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 + SolutionIndex: 884 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -106054,17 +141758,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106079,9 +141781,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106092,58 +141794,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106153,6 +141861,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106160,6 +141869,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106182,6 +141892,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106198,37 +141909,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 673 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 + SolutionIndex: 885 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106236,7 +141945,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106244,7 +141953,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -106252,62 +141961,66 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106317,6 +142030,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106324,6 +142038,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106346,6 +142061,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106362,37 +142078,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 674 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 + SolutionIndex: 886 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106400,7 +142116,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106409,7 +142125,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106420,58 +142136,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106481,6 +142201,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106488,6 +142209,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106510,6 +142232,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106526,37 +142249,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 675 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 + SolutionIndex: 887 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106564,7 +142287,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106584,27 +142307,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106617,25 +142341,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106645,6 +142372,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106652,6 +142380,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106674,6 +142403,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106690,16 +142420,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 676 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 + SolutionIndex: 888 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -106710,17 +142440,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106728,16 +142458,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106748,58 +142478,58 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106809,13 +142539,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106838,6 +142570,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106854,37 +142587,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 677 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 + SolutionIndex: 889 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106892,78 +142625,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6656 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106973,6 +142712,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -106980,6 +142720,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107002,6 +142743,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107018,37 +142760,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 678 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 + SolutionIndex: 890 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107056,15 +142796,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -107072,62 +142812,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 8 - LVCA: 2 - LVCB: 4 - LVPA: 8 - LVPB: 4 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107137,6 +142883,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -107144,6 +142891,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107166,6 +142914,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107182,37 +142931,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 679 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 + SolutionIndex: 891 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 2 - SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 2, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107227,9 +142974,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -107244,23 +142991,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 LVPA: 32 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107274,24 +143022,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107301,13 +143054,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107330,6 +143085,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107346,8 +143102,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 680 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 892 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -107355,12 +143111,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -107371,12 +143127,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107384,14 +143138,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107408,23 +143162,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107437,25 +143192,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107465,13 +143225,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107494,6 +143256,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107510,37 +143273,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 681 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 893 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107555,7 +143316,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107572,54 +143333,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107629,6 +143396,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -107636,6 +143404,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107658,6 +143427,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107674,37 +143444,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 682 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 + SolutionIndex: 894 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 8 - SubGroupA: 2 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [2, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107718,8 +143486,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107736,6 +143504,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -107746,9 +143515,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107763,23 +143536,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107789,13 +143567,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107818,6 +143598,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107834,8 +143615,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 683 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 895 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -107844,11 +143625,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -107856,15 +143637,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107872,7 +143651,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -107880,70 +143659,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107953,6 +143736,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -107960,6 +143744,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107982,6 +143767,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107998,37 +143784,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 684 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 + SolutionIndex: 896 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108036,7 +143822,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108045,7 +143831,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108056,58 +143842,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 8 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108117,6 +143907,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -108124,6 +143915,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108146,6 +143938,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108162,37 +143955,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 685 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 + SolutionIndex: 897 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 2 - SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 2, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108208,7 +144001,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -108224,23 +144017,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 + LSPA: 64 + LSPB: 128 + LVCA: 4 LVCB: 2 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108254,24 +144048,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108281,13 +144078,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108310,6 +144109,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108326,28 +144126,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 686 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 + SolutionIndex: 898 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -108356,7 +144156,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108364,7 +144164,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108381,61 +144181,65 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108445,6 +144249,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -108452,6 +144257,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108474,6 +144280,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108490,37 +144297,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 687 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 + SolutionIndex: 899 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108552,6 +144359,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -108562,13 +144370,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108583,23 +144391,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108609,6 +144420,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -108616,6 +144428,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108638,6 +144451,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108654,8 +144468,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 688 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 900 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -108663,12 +144477,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -108676,7 +144490,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -108684,7 +144498,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108699,16 +144513,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -108716,23 +144530,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108746,24 +144561,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108773,13 +144593,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108802,6 +144624,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108818,20 +144641,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 689 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 901 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -108839,16 +144662,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108856,23 +144677,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -108880,54 +144701,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 4 - LVPB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -108937,13 +144764,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108966,6 +144795,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -108982,16 +144812,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 690 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 + SolutionIndex: 902 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 8 - SubGroupA: 2 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -109003,16 +144833,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [2, 8, 4] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109020,14 +144848,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -109040,58 +144868,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109101,13 +144935,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109130,6 +144966,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109146,16 +144983,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 691 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 903 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -109166,17 +145003,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109184,16 +145019,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109204,58 +145039,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109265,13 +145106,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109294,6 +145137,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109310,20 +145154,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 692 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 904 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -109331,16 +145175,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109355,7 +145197,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109368,10 +145210,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -109382,9 +145225,9 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -109403,23 +145246,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109429,6 +145277,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -109459,6 +145308,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109475,8 +145325,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 693 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 905 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109484,28 +145334,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109513,47 +145361,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -109566,27 +145415,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109596,8 +145446,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -109626,6 +145477,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109642,8 +145494,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 694 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 906 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109651,26 +145503,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109678,47 +145532,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -109731,27 +145586,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109761,8 +145617,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -109791,6 +145648,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109807,8 +145665,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 695 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 907 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109816,26 +145674,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109843,7 +145703,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109851,39 +145711,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -109896,25 +145757,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -109924,6 +145788,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -109954,6 +145819,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -109970,8 +145836,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 696 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 908 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109979,28 +145845,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110008,7 +145874,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110017,7 +145883,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110032,23 +145898,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110061,25 +145928,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110089,8 +145959,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110119,6 +145990,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110135,8 +146007,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 697 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 909 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110144,12 +146016,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -110157,15 +146029,15 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110173,16 +146045,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110193,27 +146065,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110226,27 +146099,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110256,8 +146130,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110286,6 +146161,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110302,8 +146178,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 698 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 910 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110311,26 +146187,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110338,16 +146216,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110362,23 +146240,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110391,27 +146270,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110421,8 +146301,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110451,6 +146332,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110467,8 +146349,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 699 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 911 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110476,12 +146358,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -110489,13 +146371,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110503,7 +146387,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110512,7 +146396,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110527,23 +146411,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110556,25 +146441,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110584,6 +146472,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -110614,6 +146503,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110630,8 +146520,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 700 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 912 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110639,14 +146529,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -110654,13 +146544,13 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110675,30 +146565,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -110729,17 +146620,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110749,8 +146645,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110779,6 +146676,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110795,8 +146693,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 701 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 913 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110804,7 +146702,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 @@ -110815,17 +146713,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -110841,29 +146737,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -110894,19 +146791,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -110916,8 +146816,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -110946,6 +146847,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -110962,8 +146864,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 702 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 914 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110971,18 +146873,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -110990,7 +146892,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111006,15 +146908,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -111022,13 +146924,14 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -111059,19 +146962,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111081,8 +146987,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -111111,6 +147018,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111127,8 +147035,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 703 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 915 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111136,7 +147044,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 @@ -111149,13 +147057,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111170,16 +147078,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -111187,13 +147095,14 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -111224,17 +147133,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111244,6 +147158,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -111274,6 +147189,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111290,8 +147206,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 704 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 916 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111299,28 +147215,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111335,16 +147249,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -111352,13 +147266,14 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -111389,17 +147304,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111409,6 +147329,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -111439,6 +147360,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111455,8 +147377,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 705 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 917 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111464,7 +147386,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 @@ -111480,12 +147402,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111501,41 +147421,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111547,26 +147468,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111576,6 +147500,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -111606,6 +147531,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111622,8 +147548,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 706 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 918 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111631,18 +147557,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -111650,7 +147576,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111666,15 +147592,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -111682,25 +147608,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111712,26 +147639,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111741,6 +147671,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -111771,6 +147702,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111787,8 +147719,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 707 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 919 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111796,11 +147728,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -111815,7 +147747,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -111843,10 +147775,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -111857,15 +147790,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111877,24 +147810,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -111904,8 +147840,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -111934,6 +147871,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -111950,8 +147888,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 708 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 920 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -111959,18 +147897,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -111980,7 +147918,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112008,10 +147946,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -112022,15 +147961,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112042,24 +147981,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112069,6 +148011,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -112099,6 +148042,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112115,8 +148059,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 709 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 921 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112124,18 +148068,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -112145,7 +148089,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112161,41 +148105,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112207,24 +148152,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112234,8 +148182,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -112264,6 +148213,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112280,8 +148230,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 710 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 922 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112289,20 +148239,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -112310,7 +148260,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112325,42 +148275,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112372,26 +148323,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112401,6 +148353,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -112431,6 +148384,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112447,8 +148401,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 711 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 923 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112456,26 +148410,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112490,16 +148446,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -112507,25 +148463,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112537,26 +148494,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112566,6 +148524,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -112596,6 +148555,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112612,8 +148572,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 712 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 924 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112621,26 +148581,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112655,42 +148617,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112702,26 +148665,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112731,8 +148695,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -112761,6 +148726,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112777,8 +148743,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 713 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 925 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112786,26 +148752,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112820,42 +148788,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112867,26 +148836,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -112896,8 +148866,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -112926,6 +148897,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -112942,8 +148914,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 714 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 926 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112951,26 +148923,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112985,42 +148959,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113032,26 +149007,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113061,6 +149037,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113091,6 +149068,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113107,8 +149085,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 715 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 927 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113116,26 +149094,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113151,41 +149131,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113197,24 +149178,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113224,6 +149208,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113254,6 +149239,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113270,8 +149256,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 716 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 928 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113279,20 +149265,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -113300,7 +149286,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113315,71 +149301,77 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113389,6 +149381,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113419,6 +149412,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113435,37 +149429,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 717 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 929 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113482,14 +149474,14 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -113497,56 +149489,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113556,6 +149552,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113586,6 +149583,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113602,35 +149600,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 718 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 930 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113646,7 +149644,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -113654,64 +149652,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113721,6 +149723,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -113751,6 +149754,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -113767,35 +149771,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 719 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 931 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113803,14 +149807,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -113827,25 +149831,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113856,25 +149861,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -113884,8 +149894,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113914,12 +149925,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -113930,8 +149943,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 720 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 932 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113939,28 +149952,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -113968,14 +149979,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -113992,25 +150003,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114021,25 +150033,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114049,6 +150066,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114079,12 +150097,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114095,8 +150115,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 721 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 933 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114104,12 +150124,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -114119,13 +150139,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114133,7 +150151,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -114157,25 +150175,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114186,27 +150205,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114216,6 +150238,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114246,12 +150269,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114262,8 +150287,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 722 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 934 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114271,26 +150296,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114298,7 +150323,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -114314,7 +150339,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -114322,25 +150347,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114351,27 +150377,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114381,6 +150410,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114411,12 +150441,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114427,8 +150459,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 723 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 935 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114436,11 +150468,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -114451,11 +150483,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114463,14 +150495,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -114479,7 +150511,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -114487,25 +150519,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114516,25 +150549,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114544,6 +150582,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114574,12 +150613,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114590,8 +150631,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 724 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 936 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114599,28 +150640,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114634,17 +150673,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -114652,25 +150691,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114682,26 +150718,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114711,7 +150750,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114741,12 +150781,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114757,8 +150799,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 725 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 937 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114766,11 +150808,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -114779,13 +150821,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114800,16 +150842,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -114817,25 +150859,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114847,24 +150890,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -114874,6 +150922,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -114904,12 +150953,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -114920,8 +150971,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 726 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 938 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114929,11 +150980,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -114942,15 +150993,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -114982,6 +151031,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -114992,15 +151042,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115013,25 +151063,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115041,6 +151094,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -115071,12 +151125,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115087,8 +151143,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 727 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 939 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115096,26 +151152,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115123,7 +151179,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -115131,15 +151187,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -115147,25 +151203,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115176,25 +151233,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115204,8 +151264,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -115234,12 +151295,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115250,8 +151313,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 728 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 940 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115259,12 +151322,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -115274,13 +151337,13 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115288,44 +151351,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -115337,25 +151405,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115365,7 +151436,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115395,12 +151467,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115411,37 +151485,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 729 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 941 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115449,45 +151523,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115498,27 +151573,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115528,6 +151604,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -115558,12 +151635,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115574,35 +151653,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 730 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 942 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS0_FL1_GRVW2_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115616,39 +151697,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115660,26 +151746,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115689,8 +151776,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -115719,12 +151807,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115735,35 +151825,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 731 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 943 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115771,16 +151863,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -115791,25 +151883,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115820,7 +151917,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -115828,19 +151925,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -115850,7 +151948,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115880,12 +151979,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -115896,8 +151997,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 732 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 944 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115916,15 +152017,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -115932,16 +152035,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -115952,25 +152055,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115981,7 +152089,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -115989,17 +152097,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116009,7 +152120,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116039,12 +152151,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116055,8 +152169,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 733 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 945 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116075,17 +152189,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116100,7 +152214,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -116113,25 +152227,26 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116143,26 +152258,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116172,6 +152288,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -116202,12 +152319,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116218,35 +152337,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 734 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 946 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116254,16 +152375,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -116274,25 +152395,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116303,25 +152429,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116331,7 +152460,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116361,12 +152491,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116377,37 +152509,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 735 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 947 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116415,16 +152547,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -116435,25 +152567,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116464,27 +152601,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116494,7 +152632,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116524,12 +152663,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116540,35 +152681,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 736 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 948 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116576,45 +152719,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116625,25 +152773,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116653,7 +152806,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116683,12 +152837,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116699,37 +152855,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 737 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 949 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116737,15 +152891,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -116757,25 +152911,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116786,19 +152945,21 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 NumLoadsB: 1 @@ -116807,6 +152968,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116816,7 +152978,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116846,12 +153009,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -116862,8 +153027,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 738 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 950 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116872,25 +153037,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -116898,15 +153063,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -116918,25 +153083,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116947,25 +153117,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -116975,7 +153150,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117005,12 +153181,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117021,8 +153199,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 739 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 951 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117031,27 +153209,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117059,15 +153235,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -117079,25 +153255,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117108,19 +153289,21 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 @@ -117128,7 +153311,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117138,7 +153322,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117168,12 +153353,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117184,35 +153371,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 740 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 952 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117220,45 +153407,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117269,25 +153461,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117297,7 +153492,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117327,12 +153523,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117343,37 +153541,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 741 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 953 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117381,16 +153579,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117401,25 +153599,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117430,19 +153633,21 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 @@ -117450,7 +153655,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117460,8 +153666,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -117490,12 +153697,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117506,35 +153715,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 742 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 + SolutionIndex: 954 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117542,16 +153751,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117562,25 +153771,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117591,27 +153805,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117621,7 +153838,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117651,12 +153869,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117667,35 +153887,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 743 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 955 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117709,9 +153929,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -117719,7 +153939,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -117727,21 +153947,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 LSPA: 64 - LSPB: 32 - LVCA: 2 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 864 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117753,24 +153978,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117780,8 +154008,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -117810,12 +154039,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117826,20 +154057,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 744 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 956 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -117847,8 +154078,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -117856,7 +154087,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -117864,16 +154095,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117888,21 +154119,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -117913,27 +154145,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117943,6 +154176,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -117973,12 +154207,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -117989,20 +154225,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 745 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 957 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -118010,14 +154246,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -118031,7 +154269,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -118049,21 +154287,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -118075,24 +154318,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118102,7 +154348,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118132,12 +154379,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118148,20 +154397,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 746 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 958 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -118169,7 +154418,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -118178,7 +154427,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -118186,13 +154435,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -118210,21 +154459,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -118235,25 +154489,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118263,8 +154520,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -118293,12 +154551,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118309,37 +154569,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 747 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 959 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -118347,13 +154607,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -118371,21 +154631,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -118396,25 +154661,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118424,7 +154692,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118454,12 +154723,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118470,31 +154741,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 748 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 960 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -118525,7 +154796,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -118543,13 +154814,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118563,24 +154834,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118621,12 +154895,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118637,8 +154913,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 749 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 961 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118647,10 +154923,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -118662,7 +154938,7 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 32 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -118675,7 +154951,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118683,8 +154959,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118695,61 +154971,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118790,12 +155069,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118806,31 +155087,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 750 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 962 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118842,7 +155123,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118850,15 +155131,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -118868,55 +155149,58 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118928,7 +155212,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -118957,12 +155241,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -118973,31 +155259,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 751 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 963 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119009,15 +155295,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -119029,28 +155315,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119063,25 +155349,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119122,12 +155413,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119138,15 +155431,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 752 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 964 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -119158,13 +155451,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119183,7 +155474,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119204,51 +155495,56 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 + LSPA: 32 LSPB: 64 LVCA: 8 LVCB: 4 - LVPA: 8 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119289,12 +155585,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119305,14 +155603,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 753 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 + SolutionIndex: 965 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -119326,12 +155624,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119350,7 +155646,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -119371,48 +155667,48 @@ KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 64 + LSPA: 32 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -119459,12 +155755,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119475,14 +155773,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 754 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 966 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -119496,10 +155794,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119518,7 +155818,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -119539,48 +155839,48 @@ KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 64 + LSPA: 32 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -119627,12 +155927,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119643,14 +155945,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 755 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 967 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -119664,10 +155966,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119679,23 +155983,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -119705,44 +156009,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -119761,8 +156071,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -119791,12 +156101,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119807,20 +156119,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 756 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 968 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW1_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -119828,10 +156140,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119843,23 +156155,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -119869,44 +156181,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -119925,7 +156243,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119955,12 +156273,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -119971,15 +156291,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 757 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 969 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -119992,10 +156312,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120007,23 +156327,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -120033,48 +156353,52 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120121,12 +156445,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120137,15 +156463,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 758 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 970 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -120158,12 +156484,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120181,10 +156505,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120192,10 +156516,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -120204,41 +156528,49 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120255,7 +156587,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120285,12 +156617,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120301,15 +156635,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 759 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 971 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -120321,13 +156655,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120339,16 +156671,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120365,22 +156697,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 32 LVCA: 4 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120393,7 +156725,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -120401,12 +156733,16 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120453,12 +156789,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120469,8 +156807,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 760 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 972 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120493,9 +156831,7 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120507,80 +156843,80 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -120623,12 +156959,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120639,31 +156977,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 761 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 973 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120675,76 +157015,76 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 320 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 320 - LdsOffsetB_Blk: 2368 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120791,12 +157131,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120807,31 +157149,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 762 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 974 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120843,54 +157187,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -120898,24 +157242,24 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -120959,12 +157303,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -120975,31 +157321,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 763 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 975 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121011,16 +157359,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121031,34 +157379,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -121066,21 +157414,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121127,12 +157475,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121143,31 +157493,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 764 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + SolutionIndex: 976 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121207,20 +157559,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 8 + LSPA: 8 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121234,14 +157586,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -121293,12 +157647,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121309,15 +157665,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 765 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 977 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -121330,7 +157686,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -121354,9 +157710,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121367,7 +157723,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -121375,20 +157731,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 + LSPA: 8 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121402,21 +157758,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121463,12 +157819,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121479,31 +157837,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 766 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 978 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121515,7 +157875,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121523,7 +157883,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -121535,54 +157895,56 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 96 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 24 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121629,12 +157991,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121645,31 +158009,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 767 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 979 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -121683,7 +158047,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121691,66 +158055,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 128 - LVCA: 4 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -121770,7 +158136,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -121799,12 +158165,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121815,31 +158183,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 768 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 980 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121851,7 +158219,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121859,8 +158227,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -121871,54 +158239,56 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 128 - LVCA: 4 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -121967,12 +158337,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -121983,31 +158355,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 769 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 981 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122019,16 +158391,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -122039,56 +158411,56 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 8 LSPB: 64 - LVCA: 2 + LVCA: 8 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 + LdsOffsetB_Blk: 2176 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 8 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 8 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122106,7 +158478,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -122135,12 +158507,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122151,31 +158525,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 770 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 982 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122195,7 +158571,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -122207,7 +158583,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -122215,52 +158591,54 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -122303,12 +158681,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122319,28 +158699,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 771 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 983 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -122355,15 +158735,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -122375,54 +158755,58 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122469,12 +158853,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122485,33 +158871,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 772 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 984 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122531,7 +158915,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -122551,46 +158935,48 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -122639,12 +159025,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122655,14 +159043,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 773 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 985 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -122676,8 +159064,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -122691,16 +159079,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -122717,48 +159105,52 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 2 + LVCA: 16 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122805,12 +159197,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122821,20 +159215,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 774 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 986 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -122842,12 +159236,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122866,8 +159258,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -122876,7 +159268,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -122887,46 +159279,50 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 16 LSPB: 64 - LVCA: 8 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122973,12 +159369,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -122989,14 +159387,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 775 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 987 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -123010,12 +159408,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123036,7 +159432,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123056,36 +159452,36 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123095,14 +159491,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123152,6 +159548,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123162,20 +159559,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 776 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 988 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -123183,7 +159580,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -123207,7 +159604,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123218,7 +159615,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -123227,36 +159624,36 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123266,14 +159663,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123323,6 +159720,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123333,29 +159731,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 777 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 989 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -123377,8 +159775,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123389,7 +159787,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -123397,37 +159795,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123437,8 +159835,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -123494,6 +159892,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123504,28 +159903,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 778 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 990 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -123547,9 +159946,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123560,7 +159959,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -123568,37 +159967,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 8 LVPA: 16 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123608,10 +160007,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -123665,6 +160062,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123675,31 +160073,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 779 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 991 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123718,20 +160118,20 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -123739,16 +160139,16 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -123758,18 +160158,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123779,12 +160179,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123834,6 +160236,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -123844,15 +160247,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 780 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 992 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -123864,13 +160267,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123889,9 +160290,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123911,15 +160312,15 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 16 LVCA: 4 - LVCB: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -123929,18 +160330,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123950,8 +160351,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124005,6 +160408,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124015,15 +160419,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 781 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 993 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124036,12 +160440,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124060,9 +160462,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124070,10 +160472,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -124082,15 +160484,15 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 16 LVCA: 4 - LVCB: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -124100,18 +160502,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124121,8 +160523,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124176,6 +160580,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124186,15 +160591,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 782 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 994 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124206,13 +160611,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124230,10 +160633,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124253,32 +160656,36 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 16 LVCA: 4 - LVCB: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124288,8 +160695,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124306,7 +160715,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -124343,6 +160752,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124353,15 +160763,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 783 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 995 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124374,12 +160784,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124399,19 +160807,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -124419,37 +160827,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124459,14 +160867,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124516,6 +160924,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124526,29 +160935,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 784 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 + SolutionIndex: 996 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -124562,14 +160971,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -124578,49 +160987,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 2 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124630,14 +161039,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124687,6 +161094,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124697,31 +161105,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 785 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 + SolutionIndex: 997 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124733,7 +161143,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -124742,14 +161152,14 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -124759,24 +161169,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 128 - LVCA: 4 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124787,11 +161197,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124801,14 +161211,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124822,7 +161232,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -124858,6 +161268,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -124868,8 +161279,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 786 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 998 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124878,11 +161289,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -124892,7 +161303,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124904,23 +161315,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -124930,24 +161341,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124958,11 +161369,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124972,14 +161383,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125029,6 +161438,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125039,8 +161449,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 787 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 999 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125049,11 +161459,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -125061,9 +161471,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125075,14 +161487,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -125095,30 +161507,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125129,11 +161541,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125143,14 +161555,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125200,6 +161610,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125210,8 +161621,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 788 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1000 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125220,21 +161631,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125254,7 +161667,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -125266,7 +161679,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -125274,22 +161687,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125301,10 +161714,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125314,14 +161727,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125371,6 +161784,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125381,8 +161795,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 789 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1001 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125391,17 +161805,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -125417,7 +161831,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -125426,41 +161840,41 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125471,11 +161885,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125485,12 +161899,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125540,6 +161954,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125550,8 +161965,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 790 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1002 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125560,21 +161975,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -125595,7 +162010,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -125623,15 +162038,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125658,6 +162073,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -125711,6 +162128,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125721,8 +162139,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 791 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 + SolutionIndex: 1003 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125743,11 +162161,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125766,7 +162182,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -125794,14 +162210,14 @@ LVPA: 32 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -125829,6 +162245,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -125846,7 +162264,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -125882,6 +162300,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -125892,8 +162311,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 792 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1004 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA4_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125914,11 +162333,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125937,7 +162354,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -125950,7 +162367,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -125965,15 +162382,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125985,10 +162402,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125999,11 +162416,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126053,6 +162472,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126063,8 +162483,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 793 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1005 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126073,23 +162493,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126108,16 +162526,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -126129,22 +162547,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126157,9 +162575,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126169,13 +162587,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -126224,6 +162644,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126234,33 +162655,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 794 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1006 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126272,7 +162691,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126280,15 +162699,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -126298,24 +162717,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126326,11 +162745,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126340,15 +162759,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -126361,7 +162780,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126397,6 +162816,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126407,20 +162827,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 795 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1007 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -126428,10 +162848,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126443,7 +162863,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126451,42 +162871,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126497,11 +162917,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126511,15 +162931,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -126532,7 +162952,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126568,6 +162988,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126578,31 +162999,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 796 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1008 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126614,7 +163035,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126623,7 +163044,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -126640,24 +163061,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -126668,11 +163089,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126682,15 +163103,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -126703,7 +163124,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126739,6 +163160,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126749,20 +163171,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 797 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1009 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -126770,10 +163192,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126785,7 +163207,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126793,7 +163215,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -126801,38 +163223,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -126840,10 +163262,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126853,14 +163275,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126874,7 +163296,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126910,6 +163332,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -126920,31 +163343,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 798 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1010 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126956,7 +163379,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126964,57 +163387,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127024,14 +163447,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127081,6 +163504,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127091,31 +163515,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 799 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1011 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127127,15 +163551,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127144,37 +163568,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127182,10 +163606,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127195,12 +163619,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127214,7 +163640,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -127250,6 +163676,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127260,33 +163687,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 800 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127298,54 +163723,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127353,10 +163778,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127366,11 +163791,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -127385,7 +163812,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -127421,6 +163848,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127431,33 +163859,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 801 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1013 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127469,54 +163895,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127524,10 +163950,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127537,11 +163963,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -127592,6 +164020,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127602,33 +164031,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 802 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1014 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127640,15 +164067,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127656,8 +164083,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -127666,28 +164093,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127695,10 +164122,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127708,12 +164135,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127763,6 +164192,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127773,33 +164203,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 803 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1015 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR0_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127811,54 +164239,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127866,10 +164294,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127879,11 +164307,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -127898,7 +164328,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -127934,6 +164364,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -127944,33 +164375,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 804 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127982,54 +164411,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128037,10 +164466,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128050,11 +164479,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -128069,7 +164500,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128105,6 +164536,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128115,33 +164547,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 805 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1017 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB0_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128153,54 +164583,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128208,10 +164638,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128221,11 +164651,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -128276,6 +164708,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128286,33 +164719,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 806 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128324,7 +164755,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128332,57 +164763,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 8 LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128392,14 +164823,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128413,7 +164844,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128449,6 +164880,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128459,31 +164891,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 807 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1019 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128495,7 +164927,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128503,57 +164935,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 8 LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128563,14 +164995,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128584,7 +165016,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128620,6 +165052,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128630,31 +165063,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 808 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 + SolutionIndex: 1020 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128666,7 +165099,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128674,57 +165107,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128734,14 +165167,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128755,7 +165188,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128791,6 +165224,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128801,31 +165235,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 809 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1021 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128837,14 +165271,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -128854,48 +165288,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128905,14 +165339,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128962,6 +165394,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -128972,31 +165405,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1022 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129008,7 +165443,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -129025,48 +165460,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129076,14 +165511,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129133,6 +165568,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -129143,31 +165579,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1023 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129179,7 +165615,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -129188,7 +165624,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -129196,7 +165632,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -129205,39 +165641,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129247,14 +165683,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129304,6 +165740,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -129314,31 +165751,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1024 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129350,16 +165787,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -129367,48 +165804,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129418,14 +165855,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129475,6 +165910,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -129485,31 +165921,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1025 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129521,65 +165959,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129589,12 +166027,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129608,7 +166048,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -129644,6 +166084,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -129654,33 +166095,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1026 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU4_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129699,8 +166138,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -129708,11 +166147,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -129720,37 +166159,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129760,12 +166199,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129815,43 +166256,44 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1027 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129870,16 +166312,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -129891,37 +166333,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129931,12 +166373,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129950,7 +166394,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -129986,43 +166430,44 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1028 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130041,16 +166486,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -130062,37 +166507,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130102,12 +166547,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -130157,43 +166604,44 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1029 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130225,7 +166673,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -130240,15 +166688,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130260,10 +166708,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130273,12 +166721,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -130328,18 +166776,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1030 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130348,19 +166799,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -130376,7 +166827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -130402,24 +166853,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130430,11 +166881,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130444,11 +166895,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -130463,7 +166914,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -130499,18 +166950,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 1031 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130519,11 +166973,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -130531,9 +166985,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -130554,7 +167008,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -130567,7 +167021,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -130582,15 +167036,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130602,9 +167056,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -130615,11 +167069,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -130634,7 +167090,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -130670,18 +167126,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 820 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1032 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130690,23 +167149,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130724,8 +167181,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -130753,15 +167210,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3200 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130774,9 +167227,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130786,12 +167239,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -130804,7 +167259,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -130841,18 +167296,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 821 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 1033 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130861,11 +167319,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -130873,11 +167331,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130909,7 +167365,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -130924,15 +167380,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130945,9 +167401,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130957,12 +167413,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -131012,18 +167468,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 822 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1034 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -131032,17 +167491,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -131067,8 +167526,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -131088,36 +167547,36 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131128,13 +167587,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -131185,30 +167642,33 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 823 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1035 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -131216,10 +167676,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131239,15 +167701,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -131259,37 +167721,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131299,15 +167761,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -131356,25 +167818,28 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 824 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 + SolutionIndex: 1036 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -131387,8 +167852,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -131410,19 +167875,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -131430,33 +167895,33 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -131470,15 +167935,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -131527,18 +167992,21 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 825 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1037 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -131557,8 +168025,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -133817,24 +170285,10 @@ - [95, 6513.35] - - [4288, 1024, 1, 128] - [80, 4291.67] - - - [512, 2048, 1, 49] - - [126, 4554.98] - - - [512, 128, 1, 784] - - [119, 3195.29] - - - [2048, 512, 1, 49] - - [127, 4253.33] - - - [1024, 256, 1, 196] - - [123, 4039.33] - - [256, 64, 1, 3136] - [121, 3015.27] - - [256, 1024, 1, 196] - [125, 4225.35] - - - [64, 256, 1, 3136] - - [122, 3058.35] - - - [128, 512, 1, 784] - - [120, 3380.28] - - - [64, 64, 1, 3136] - - [124, 1372.34] - - [1024, 1024, 1, 3328] - [237, 8705.0] - - [2048, 200, 1, 3200] @@ -134667,8 +171121,6 @@ - [231, 5745.62] - - [1024, 200, 1, 1280] - [223, 4446.13] - - - [4096, 512, 1, 4096] - - [141, 9264.39] - - [2048, 256, 1, 3200] - [231, 7842.75] - - [2048, 512, 1, 15360] @@ -135169,64 +171621,28 @@ - [237, 6628.17] - - [4096, 1024, 1, 6144] - [139, 9592.98] - - - [1280, 384, 1, 64] - - [270, 3196.88] - - [256, 64, 1, 1225] - [271, 1194.67] - - [2048, 320, 1, 64] - [273, 3449.26] - - - [256, 48, 1, 1225] - - [264, 913.398] - - - [2048, 192, 1, 64] - - [263, 2516.58] - - [1024, 128, 1, 289] - [277, 2869.68] - - - [1280, 192, 1, 64] - - [256, 1872.46] - - - [192, 32, 1, 1225] - - [261, 505.806] - - - [1280, 448, 1, 64] - - [257, 3078.87] - - [384, 64, 1, 1225] - [262, 1511.33] - - [2048, 384, 1, 64] - [275, 3836.25] - - - [288, 48, 1, 1225] - - [258, 1032.59] - - [64, 80, 1, 5329] - [274, 888.167] - - [1024, 384, 1, 289] - [268, 4291.52] - - [2048, 448, 1, 64] - [267, 3783.52] - - - [1280, 320, 1, 64] - - [273, 2776.95] - - - [192, 64, 1, 1225] - - [258, 926.897] - - - [384, 192, 1, 1225] - - [269, 2560.0] - - - [1536, 256, 1, 64] - - [276, 2621.44] - - - [192, 48, 1, 1225] - - [261, 698.614] - - - [768, 128, 1, 289] - - [278, 2291.12] - - - [1024, 256, 1, 289] - - [276, 4064.36] - - [768, 192, 1, 289] - [272, 2690.33] - - - [1536, 384, 1, 64] - - [259, 3145.73] - - [288, 64, 1, 1225] - [261, 1142.67] - - - [1024, 192, 1, 289] - - [266, 3243.13] - - [384, 96, 1, 1225] - [279, 1844.71] - - - [160, 64, 1, 5329] - - [265, 1564.48] - - - [768, 160, 1, 289] - - [260, 2386.58] - - [1024, 3392, 1, 4096] - [305, 8502.92] - - [1024, 3301, 1, 4096] @@ -137411,5852 +173827,6340 @@ - [372, 5309.25] - - [2816, 8976, 1, 256] - [383, 9409.56] + - - [1728, 320, 1, 64] + - [419, 3205.57] + - - [1152, 128, 1, 784] + - [466, 3498.96] + - - [576, 96, 1, 5329] + - [452, 3947.92] + - - [864, 96, 1, 1225] + - [473, 3009.67] + - - [256, 128, 1, 784] + - [463, 1536.49] + - - [1440, 320, 1, 196] + - [416, 4824.62] + - - [192, 48, 1, 1225] + - [494, 820.465] + - - [2592, 384, 1, 289] + - [434, 7353.01] + - - [192, 80, 36, 10368] + - [484, 5360.04] + - - [896, 192, 1, 289] + - [451, 3076.56] + - - [768, 128, 1, 289] + - [476, 2351.81] + - - [64, 256, 1, 3136] + - [502, 1809.16] + - - [1280, 384, 1, 64] + - [416, 3171.1] + - - [512, 144, 1, 196] + - [474, 1445.07] + - - [1344, 192, 1, 289] + - [457, 4376.52] + - - [288, 64, 1, 21609] + - [468, 3396.12] + - - [400, 32, 1, 784] + - [495, 922.353] + - - [288, 32, 1, 21609] + - [506, 2816.01] + - - [1280, 448, 1, 64] + - [419, 3253.56] + - - [3456, 256, 1, 169] + - [431, 5822.44] + - - [2304, 256, 1, 196] + - [429, 4931.98] + - - [384, 192, 1, 1225] + - [477, 2720.39] + - - [832, 48, 1, 49] + - [472, 344.518] + - - [832, 192, 1, 49] + - [454, 1099.36] + - - [1280, 192, 1, 64] + - [455, 2069.56] + - - [192, 32, 1, 784] + - [494, 459.627] + - - [288, 48, 1, 1225] + - [501, 1176.0] + - - [512, 112, 1, 196] + - [469, 1277.21] + - - [224, 192, 36, 2592] + - [486, 7369.56] + - - [528, 32, 1, 196] + - [460, 440.374] + - - [192, 128, 36, 1568] + - [485, 8245.76] + - - [4032, 384, 1, 64] + - [430, 5898.24] + - - [576, 64, 1, 3136] + - [475, 2671.11] + - - [2048, 32, 1, 1001] + - [477, 2323.0] + - - [480, 64, 1, 196] + - [462, 752.64] + - - [512, 256, 1, 196] + - [464, 2528.55] + - - [864, 96, 1, 289] + - [474, 1958.4] + - - [896, 128, 1, 289] + - [477, 2725.73] + - - [192, 64, 1, 784] + - [492, 898.675] + - - [1200, 64, 1, 1225] + - [476, 2780.14] + - - [1296, 288, 1, 196] + - [415, 3826.18] + - - [576, 96, 1, 5041] + - [456, 3795.58] + - - [1024, 256, 1, 289] + - [445, 4488.13] + - - [1024, 2048, 1, 49] + - [435, 5077.1] + - - [192, 64, 36, 6272] + - [479, 7514.98] + - - [4096, 512, 1, 4096] + - [441, 10276.0] + - - [192, 32, 1, 1225] + - [495, 556.686] + - - [1024, 256, 1, 196] + - [455, 3892.44] + - - [1120, 192, 1, 289] + - [444, 3752.81] + - - [400, 48, 1, 196] + - [469, 480.0] + - - [1728, 224, 1, 1225] + - [422, 5575.77] + - - [800, 96, 1, 784] + - [476, 2668.94] + - - [1152, 384, 1, 64] + - [426, 3077.34] + - - [4608, 512, 1, 49] + - [433, 4676.6] + - - [1792, 256, 1, 289] + - [426, 5345.94] + - - [864, 128, 1, 784] + - [476, 3816.2] + - - [1728, 384, 1, 169] + - [428, 5191.68] + - - [480, 16, 1, 196] + - [497, 241.231] + - - [1568, 256, 1, 289] + - [416, 4723.41] + - - [1152, 448, 1, 64] + - [422, 3356.72] + - - [512, 64, 1, 196] + - [461, 802.816] + - - [1344, 224, 1, 289] + - [416, 3519.63] + - - [9216, 512, 1, 4096] + - [439, 9146.02] + - - [27, 32, 1, 22201] + - [507, 264.356] + - - [1152, 192, 1, 784] + - [446, 4904.08] + - - [1536, 256, 1, 64] + - [414, 2578.47] + - - [800, 128, 1, 196] + - [476, 1991.11] + - - [800, 64, 1, 196] + - [471, 1150.83] + - - [864, 208, 1, 196] + - [448, 2684.72] + - - [1440, 320, 1, 49] + - [417, 2313.44] + - - [512, 128, 1, 784] + - [467, 2780.32] + - - [720, 192, 1, 5041] + - [442, 5410.46] + - - [256, 64, 1, 784] + - [499, 1163.5] + - - [256, 48, 1, 1225] + - [494, 1075.2] + - - [576, 192, 1, 3136] + - [442, 4833.01] + - - [160, 64, 1, 5329] + - [496, 1753.5] + - - [3456, 384, 1, 289] + - [436, 7341.75] + - - [32, 32, 36, 43808] + - [490, 1378.03] + - - [1344, 512, 1, 64] + - [415, 3822.93] + - - [192, 16, 1, 784] + - [495, 228.073] + - - [3456, 384, 1, 169] + - [432, 6675.02] + - - [1152, 256, 1, 196] + - [425, 3211.26] + - - [1728, 192, 1, 1225] + - [426, 4852.26] + - - [2048, 512, 1, 49] + - [438, 3471.64] + - - [576, 96, 1, 1225] + - [469, 2176.66] + - - [512, 2048, 1, 49] + - [420, 3845.83] + - - [1728, 192, 1, 64] + - [415, 2369.83] + - - [832, 256, 1, 49] + - [445, 1433.6] + - - [512, 128, 1, 196] + - [470, 1459.67] + - - [1200, 128, 1, 49] + - [465, 1069.09] + - - [528, 256, 1, 196] + - [453, 2069.76] + - - [256, 512, 1, 784] + - [476, 4538.89] + - - [480, 192, 1, 196] + - [476, 1792.0] + - - [96, 64, 36, 2592] + - [483, 4845.41] + - - [96, 96, 36, 2592] + - [488, 5111.53] + - - [1024, 192, 1, 289] + - [450, 3431.14] + - - [1536, 384, 1, 64] + - [421, 3166.84] + - - [192, 96, 1, 784] + - [461, 881.14] + - - [2048, 192, 1, 64] + - [418, 2330.17] + - - [192, 64, 1, 1225] + - [500, 1100.35] + - - [512, 32, 1, 196] + - [491, 477.867] + - - [128, 96, 36, 1568] + - [487, 6649.09] + - - [528, 128, 1, 196] + - [473, 1403.23] + - - [128, 512, 1, 784] + - [463, 2237.81] + - - [128, 128, 36, 3136] + - [480, 6538.77] + - - [528, 160, 1, 196] + - [477, 1642.67] + - - [448, 64, 1, 5329] + - [452, 3264.81] + - - [1280, 320, 1, 64] + - [416, 2776.95] + - - [1792, 320, 1, 289] + - [428, 5204.9] + - - [2880, 320, 1, 64] + - [424, 4336.94] + - - [147, 64, 1, 12544] + - [505, 2430.27] + - - [4096, 512, 1, 1001] + - [440, 9618.99] + - - [1536, 32, 1, 1001] + - [477, 1757.18] + - - [512, 160, 1, 196] + - [473, 1592.89] + - - [768, 160, 1, 289] + - [474, 2757.17] + - - [1728, 384, 1, 49] + - [426, 3102.49] + - - [64, 32, 36, 43808] + - [481, 2626.43] + - - [64, 64, 1, 3136] + - [493, 610.506] + - - [256, 32, 1, 784] + - [494, 612.837] + - - [480, 96, 1, 196] + - [469, 1055.1] + - - [1024, 32, 1, 1001] + - [459, 1188.43] + - - [832, 160, 1, 49] + - [474, 959.247] + - - [512, 1024, 1, 196] + - [417, 4978.7] + - - [96, 64, 36, 10368] + - [511, 5000.95] + - - [384, 448, 36, 512] + - [516, 8903.0] + - - [2048, 64, 1, 1001] + - [509, 4385.13] + - - [224, 192, 36, 5184] + - [515, 7487.81] + - - [2048, 128, 1, 1001] + - [508, 5764.63] + - - [96, 96, 36, 10368] + - [517, 5275.21] + - - [192, 80, 36, 20736] + - [513, 5409.4] + - - [96, 64, 36, 5184] + - [511, 4911.83] + - - [1536, 64, 1, 1001] + - [510, 3162.03] + - - [96, 64, 36, 20736] + - [512, 5034.33] + - - [384, 448, 36, 256] + - [514, 8815.87] + - - [96, 96, 36, 5184] + - [518, 5236.02] - - [1024, 128, 1, 128] - - [425, 896.319] + - [531, 896.319] - - [4, 704, 1, 1280] - - [462, 328.976] + - [568, 328.976] - - [4, 1856, 1, 3328] - - [472, 501.461] + - [578, 501.461] - - [1856, 448, 1, 3328] - - [517, 5678.01] + - [623, 5678.01] - - [2944, 4288, 1, 1280] - - [503, 8412.49] + - [609, 8412.49] - - [2368, 64, 1, 3328] - - [453, 4914.02] + - [559, 4914.02] - - [1760, 32, 1, 1760] - - [480, 3313.04] + - [586, 3313.04] - - [2368, 5888, 1, 256] - - [503, 6489.82] + - [609, 6489.82] - - [5888, 1856, 1, 256] - - [515, 7791.98] + - [621, 7791.98] - - [128, 64, 1, 256] - - [487, 369.317] + - [593, 369.317] - - [512, 24000, 1, 1536] - - [509, 8827.47] + - [615, 8827.47] - - [128, 6784, 1, 3328] - - [509, 6537.09] + - [615, 6537.09] - - [5888, 1408, 1, 256] - - [523, 6129.71] + - [629, 6129.71] - - [5888, 1856, 1, 3328] - - [509, 7969.27] + - [615, 7969.27] - - [5056, 704, 1, 256] - - [509, 6723.92] + - [615, 6723.92] - - [2048, 400, 1, 512] - - [515, 4531.54] + - [621, 4531.54] - - [5888, 2944, 1, 3328] - - [515, 8608.14] + - [621, 8608.14] - - [1856, 4288, 1, 256] - - [515, 6297.64] + - [621, 6297.64] - - [1024, 5056, 1, 128] - - [493, 3595.47] + - [599, 3595.47] - - [5056, 5056, 1, 3328] - - [509, 8559.26] + - [615, 8559.26] - - [1408, 5888, 1, 1280] - - [504, 6797.16] + - [610, 6797.16] - - [2368, 448, 1, 128] - - [493, 2815.0] + - [599, 2815.0] - - [2368, 6784, 1, 128] - - [497, 4782.08] + - [603, 4782.08] - - [1024, 3584, 1, 3328] - - [505, 8402.54] + - [611, 8402.54] - - [512, 48000, 1, 2048] - - [509, 8162.33] + - [615, 8162.33] - - [128, 448, 1, 1280] - - [480, 2903.59] + - [586, 2903.59] - - [256, 4288, 1, 3328] - - [510, 6346.04] + - [616, 6346.04] - - [5888, 1408, 1, 1280] - - [509, 8959.55] + - [615, 8959.55] - - [704, 1856, 1, 3328] - - [504, 6955.37] + - [610, 6955.37] - - [4, 1408, 1, 128] - - [524, 60.1747] + - [630, 60.1747] - - [1024, 2368, 1, 256] - - [511, 5927.88] + - [617, 5927.88] - - [64, 4, 1, 256] - - [529, 13.3129] + - [635, 13.3129] - - [1408, 1856, 1, 1280] - - [507, 8051.68] + - [613, 8051.68] - - [1408, 64, 1, 1280] - - [483, 3400.55] + - [589, 3400.55] - - [448, 1024, 1, 1280] - - [511, 5730.02] + - [617, 5730.02] - - [6144, 24000, 1, 2048] - - [515, 7738.4] + - [621, 7738.4] - - [4096, 32, 1, 4096] - - [453, 2381.53] + - [559, 2381.53] - - [256, 1408, 1, 3328] - - [511, 4844.88] + - [617, 4844.88] - - [5056, 5056, 1, 1280] - - [515, 9090.2] + - [621, 9090.2] - - [448, 5056, 1, 256] - - [521, 4961.28] + - [627, 4961.28] - - [704, 1856, 1, 1280] - - [507, 6456.54] + - [613, 6456.54] - - [128, 5056, 1, 128] - - [436, 2251.12] + - [542, 2251.12] - - [2368, 128, 1, 256] - - [504, 3403.37] + - [610, 3403.37] - - [1760, 6400, 1, 1760] - - [503, 8959.8] + - [609, 8959.8] - - [1856, 1408, 1, 128] - - [496, 3493.16] + - [602, 3493.16] - - [64, 5056, 1, 256] - - [505, 2582.32] + - [611, 2582.32] - - [6784, 256, 1, 3328] - - [503, 7323.64] + - [609, 7323.64] - - [6784, 4288, 1, 3328] - - [505, 8542.19] + - [611, 8542.19] - - [4288, 448, 1, 256] - - [521, 5030.6] + - [627, 5030.6] - - [64, 704, 1, 128] - - [438, 375.567] + - [544, 375.567] - - [1856, 2368, 1, 3328] - - [514, 6742.44] + - [620, 6742.44] - - [4288, 2944, 1, 1280] - - [515, 8578.27] + - [621, 8578.27] - - [704, 5056, 1, 1280] - - [511, 8014.55] + - [617, 8014.55] - - [2368, 704, 1, 3328] - - [510, 6544.41] + - [616, 6544.41] - - [256, 5888, 1, 256] - - [508, 5933.0] + - [614, 5933.0] - - [1856, 4288, 1, 3328] - - [514, 7410.82] + - [620, 7410.82] - - [256, 2944, 1, 256] - - [510, 5014.08] + - [616, 5014.08] - - [5888, 1024, 1, 256] - - [515, 8069.44] + - [621, 8069.44] - - [448, 64, 1, 1280] - - [490, 2057.28] + - [596, 2057.28] - - [3072, 64, 1, 1024] - - [470, 2145.52] + - [576, 2145.52] - - [3584, 4, 1, 1280] - - [462, 498.743] + - [568, 498.743] - - [16384, 3200, 1, 4096] - - [502, 6621.53] + - [608, 6621.53] - - [2944, 64, 1, 256] - - [510, 2554.89] + - [616, 2554.89] - - [128, 4, 1, 1280] - - [472, 87.2489] + - [578, 87.2489] - - [1408, 2944, 1, 256] - - [509, 8029.45] + - [615, 8029.45] - - [256, 1856, 1, 1280] - - [504, 6170.7] + - [610, 6170.7] - - [6784, 5056, 1, 3328] - - [513, 7134.29] + - [619, 7134.29] - - [5056, 5056, 1, 256] - - [521, 6246.9] + - [627, 6246.9] - - [1408, 6784, 1, 128] - - [498, 4329.55] + - [604, 4329.55] - - [64, 1024, 1, 1280] - - [480, 3206.75] + - [586, 3206.75] - - [2944, 4, 1, 256] - - [529, 333.58] + - [635, 333.58] - - [704, 5056, 1, 128] - - [493, 4085.52] + - [599, 4085.52] - - [4, 2368, 1, 1280] - - [530, 394.767] + - [636, 394.767] - - [2368, 2944, 1, 1280] - - [509, 8634.05] + - [615, 8634.05] - - [128, 3584, 1, 1280] - - [510, 6046.25] + - [616, 6046.25] - - [6784, 6784, 1, 1280] - - [515, 8847.51] + - [621, 8847.51] - - [1408, 4288, 1, 1280] - - [515, 8236.79] + - [621, 8236.79] - - [3584, 4288, 1, 1280] - - [510, 7399.98] + - [616, 7399.98] - - [2368, 704, 1, 1280] - - [503, 6754.5] + - [609, 6754.5] - - [5056, 4288, 1, 3328] - - [509, 8569.63] + - [615, 8569.63] - - [3584, 2368, 1, 3328] - - [514, 7942.48] + - [620, 7942.48] - - [64, 704, 1, 1280] - - [483, 2363.69] + - [589, 2363.69] - - [4288, 256, 1, 256] - - [511, 4591.9] + - [617, 4591.9] - - [2944, 128, 1, 128] - - [436, 1878.39] + - [542, 1878.39] - - [6144, 32, 1, 2560] - - [481, 3334.2] + - [587, 3334.2] - - [6784, 448, 1, 1280] - - [513, 7939.3] + - [619, 7939.3] - - [1408, 2944, 1, 128] - - [497, 4096.61] + - [603, 4096.61] - - [4288, 2944, 1, 256] - - [503, 8141.23] + - [609, 8141.23] - - [5888, 704, 1, 1280] - - [504, 7516.23] + - [610, 7516.23] - - [5056, 4, 1, 3328] - - [447, 552.509] + - [553, 552.509] - - [1856, 64, 1, 1280] - - [453, 3870.86] + - [559, 3870.86] - - [1760, 16, 1, 1760] - - [465, 2181.51] + - [571, 2181.51] - - [448, 5888, 1, 128] - - [498, 3371.1] + - [604, 3371.1] - - [5888, 64, 1, 3328] - - [478, 5319.48] + - [584, 5319.48] - - [2944, 256, 1, 3328] - - [510, 7122.4] + - [616, 7122.4] - - [1024, 64, 1, 128] - - [425, 595.882] + - [531, 595.882] - - [5056, 2368, 1, 1280] - - [504, 7778.29] + - [610, 7778.29] - - [448, 3584, 1, 1280] - - [509, 6500.62] + - [615, 6500.62] - - [6784, 5888, 1, 256] - - [509, 8918.68] + - [615, 8918.68] - - [704, 1024, 1, 128] - - [493, 2627.51] + - [599, 2627.51] - - [704, 128, 1, 1280] - - [480, 3408.59] + - [586, 3408.59] - - [4, 3584, 1, 128] - - [524, 140.821] + - [630, 140.821] - - [1408, 448, 1, 1280] - - [504, 5881.54] + - [610, 5881.54] - - [1024, 1408, 1, 256] - - [508, 5647.27] + - [614, 5647.27] - - [2368, 2368, 1, 3328] - - [502, 7688.83] + - [608, 7688.83] - - [1856, 6784, 1, 128] - - [493, 4705.95] + - [599, 4705.95] - - [5056, 704, 1, 3328] - - [513, 8198.98] + - [619, 8198.98] - - [1408, 1856, 1, 256] - - [515, 6340.05] + - [621, 6340.05] - - [1408, 704, 1, 3328] - - [507, 7599.65] + - [613, 7599.65] - - [2368, 5056, 1, 256] - - [515, 8242.85] + - [621, 8242.85] - - [1408, 256, 1, 1280] - - [510, 4879.26] + - [616, 4879.26] - - [3072, 128, 1, 1024] - - [479, 2525.52] + - [585, 2525.52] - - [3584, 2368, 1, 1280] - - [511, 8132.72] + - [617, 8132.72] - - [4288, 64, 1, 3328] - - [466, 5156.53] + - [572, 5156.53] - - [2368, 4, 1, 1280] - - [528, 482.75] + - [634, 482.75] - - [704, 5888, 1, 256] - - [518, 5398.75] + - [624, 5398.75] - - [6784, 2944, 1, 128] - - [494, 4748.99] + - [600, 4748.99] - - [2560, 1600, 1, 2560] - - [505, 7355.0] + - [611, 7355.0] - - [4288, 6784, 1, 3328] - - [502, 7409.41] + - [608, 7409.41] - - [2944, 256, 1, 256] - - [510, 5077.42] + - [616, 5077.42] - - [2944, 6784, 1, 3328] - - [515, 8068.05] + - [621, 8068.05] - - [704, 1408, 1, 3328] - - [510, 7239.43] + - [616, 7239.43] - - [6144, 5984, 1, 2048] - - [509, 7176.07] + - [615, 7176.07] - - [3584, 704, 1, 3328] - - [515, 6642.86] + - [621, 6642.86] - - [2944, 256, 1, 128] - - [494, 2644.54] + - [600, 2644.54] - - [6784, 4, 1, 1280] - - [526, 402.487] + - [632, 402.487] - - [1024, 64, 1, 1280] - - [480, 2602.03] + - [586, 2602.03] - - [2048, 1600, 1, 512] - - [507, 5592.5] + - [613, 5592.5] - - [448, 4288, 1, 256] - - [505, 6128.99] + - [611, 6128.99] - - [64, 3584, 1, 3328] - - [446, 5534.93] + - [552, 5534.93] - - [1856, 4288, 1, 128] - - [496, 4400.11] + - [602, 4400.11] - - [704, 2368, 1, 1280] - - [521, 5735.02] + - [627, 5735.02] - - [1856, 2368, 1, 1280] - - [518, 6482.4] + - [624, 6482.4] - - [2368, 128, 1, 3328] - - [491, 4717.32] + - [597, 4717.32] - - [2944, 128, 1, 256] - - [518, 3276.9] + - [624, 3276.9] - - [448, 1408, 1, 256] - - [510, 4852.28] + - [616, 4852.28] - - [1856, 4288, 1, 1280] - - [505, 8132.96] + - [611, 8132.96] - - [64, 5056, 1, 3328] - - [481, 5097.06] + - [587, 5097.06] - - [4, 704, 1, 256] - - [528, 128.831] + - [634, 128.831] - - [1024, 448, 1, 128] - - [493, 1816.94] + - [599, 1816.94] - - [704, 4, 1, 1280] - - [529, 328.976] + - [635, 328.976] - - [704, 256, 1, 128] - - [497, 876.569] + - [603, 876.569] - - [704, 2944, 1, 128] - - [497, 3734.47] + - [603, 3734.47] - - [1408, 1024, 1, 1280] - - [505, 7224.85] + - [611, 7224.85] - - [704, 6784, 1, 256] - - [509, 7354.77] + - [615, 7354.77] - - [6784, 704, 1, 256] - - [505, 6012.28] + - [611, 6012.28] - - [5056, 1408, 1, 128] - - [498, 4311.28] + - [604, 4311.28] - - [2048, 7000, 1, 2048] - - [509, 7232.07] + - [615, 7232.07] - - [256, 3584, 1, 3328] - - [513, 7006.0] + - [619, 7006.0] - - [4, 5888, 1, 3328] - - [531, 534.612] + - [637, 534.612] - - [128, 1408, 1, 128] - - [423, 1177.07] + - [529, 1177.07] - - [3584, 4288, 1, 3328] - - [515, 7135.0] + - [621, 7135.0] - - [5888, 1856, 1, 1280] - - [503, 8395.03] + - [609, 8395.03] - - [256, 1408, 1, 256] - - [504, 3977.46] + - [610, 3977.46] - - [5056, 64, 1, 1280] - - [504, 4257.78] + - [610, 4257.78] - - [1024, 704, 1, 256] - - [504, 5036.93] + - [610, 5036.93] - - [448, 128, 1, 128] - - [425, 533.533] + - [531, 533.533] - - [2368, 3584, 1, 1280] - - [509, 8272.43] + - [615, 8272.43] - - [2368, 6784, 1, 1280] - - [502, 8288.24] + - [608, 8288.24] - - [1856, 4, 1, 1280] - - [442, 464.1] + - [548, 464.1] - - [448, 448, 1, 256] - - [504, 3058.45] + - [610, 3058.45] - - [2944, 3584, 1, 3328] - - [509, 8557.63] + - [615, 8557.63] - - [7680, 32, 1, 2560] - - [481, 3729.03] + - [587, 3729.03] - - [128, 4288, 1, 128] - - [424, 2116.2] + - [530, 2116.2] - - [256, 256, 1, 3328] - - [480, 4051.06] + - [586, 4051.06] - - [128, 1024, 1, 3328] - - [453, 5139.21] + - [559, 5139.21] - - [4, 1408, 1, 3328] - - [472, 502.871] + - [578, 502.871] - - [6784, 2944, 1, 256] - - [503, 8446.06] + - [609, 8446.06] - - [64, 1856, 1, 1280] - - [445, 3870.86] + - [551, 3870.86] - - [6784, 64, 1, 128] - - [493, 1877.62] + - [599, 1877.62] - - [4288, 2368, 1, 3328] - - [513, 8419.4] + - [619, 8419.4] - - [1856, 2368, 1, 256] - - [507, 6887.48] + - [613, 6887.48] - - [3584, 256, 1, 128] - - [497, 2496.71] + - [603, 2496.71] - - [3584, 6784, 1, 3328] - - [509, 7626.18] + - [615, 7626.18] - - [256, 1024, 1, 256] - - [510, 3095.53] + - [616, 3095.53] - - [4, 6784, 1, 3328] - - [472, 589.274] + - [578, 589.274] - - [1024, 5888, 1, 3328] - - [509, 7794.35] + - [615, 7794.35] - - [1024, 128, 1, 1280] - - [482, 3130.18] + - [588, 3130.18] - - [3072, 32, 1, 1024] - - [469, 1675.59] + - [575, 1675.59] - - [6144, 24000, 1, 2560] - - [509, 7256.14] + - [615, 7256.14] - - [5056, 4288, 1, 1280] - - [507, 8349.03] + - [613, 8349.03] - - [5888, 64, 1, 256] - - [456, 2593.35] + - [562, 2593.35] - - [6784, 1856, 1, 3328] - - [503, 8087.38] + - [609, 8087.38] - - [1408, 5056, 1, 1280] - - [505, 7802.63] + - [611, 7802.63] - - [1856, 256, 1, 1280] - - [510, 6150.73] + - [616, 6150.73] - - [64, 5888, 1, 3328] - - [477, 5301.49] + - [583, 5301.49] - - [2368, 2368, 1, 1280] - - [507, 8233.43] + - [613, 8233.43] - - [2944, 5888, 1, 128] - - [500, 3745.51] + - [606, 3745.51] - - [704, 5888, 1, 1280] - - [505, 8245.04] + - [611, 8245.04] - - [2368, 3584, 1, 128] - - [497, 4523.43] + - [603, 4523.43] - - [1856, 5056, 1, 128] - - [494, 4498.08] + - [600, 4498.08] - - [704, 1024, 1, 1280] - - [518, 5479.59] + - [624, 5479.59] - - [448, 256, 1, 3328] - - [461, 5048.8] + - [567, 5048.8] - - [448, 1856, 1, 128] - - [494, 2936.92] + - [600, 2936.92] - - [8192, 3200, 1, 2048] - - [503, 6713.12] + - [609, 6713.12] - - [128, 1024, 1, 128] - - [439, 998.744] + - [545, 998.744] - - [2944, 4, 1, 128] - - [524, 98.7471] + - [630, 98.7471] - - [1024, 704, 1, 1280] - - [510, 5897.0] + - [616, 5897.0] - - [128, 5888, 1, 256] - - [510, 5014.08] + - [616, 5014.08] - - [1024, 5056, 1, 1280] - - [509, 8857.81] + - [615, 8857.81] - - [4288, 1024, 1, 256] - - [515, 6195.39] + - [621, 6195.39] - - [2944, 2368, 1, 128] - - [493, 4442.23] + - [599, 4442.23] - - [704, 704, 1, 3328] - - [510, 6764.4] + - [616, 6764.4] - - [704, 1408, 1, 1280] - - [511, 7383.58] + - [617, 7383.58] - - [5888, 448, 1, 1280] - - [509, 7299.49] + - [615, 7299.49] - - [3584, 256, 1, 3328] - - [507, 7061.72] + - [613, 7061.72] - - [704, 5888, 1, 3328] - - [511, 8142.42] + - [617, 8142.42] - - [704, 1856, 1, 128] - - [497, 3139.14] + - [603, 3139.14] - - [448, 448, 1, 3328] - - [475, 5063.34] + - [581, 5063.34] - - [4, 4288, 1, 128] - - [525, 64.9775] + - [631, 64.9775] - - [128, 704, 1, 1280] - - [445, 3400.55] + - [551, 3400.55] - - [3584, 2944, 1, 256] - - [515, 7982.14] + - [621, 7982.14] - - [3584, 4, 1, 128] - - [524, 105.318] + - [630, 105.318] - - [1856, 128, 1, 3328] - - [476, 5442.19] + - [582, 5442.19] - - [4, 64, 1, 1280] - - [530, 42.3268] + - [636, 42.3268] - - [2944, 448, 1, 128] - - [493, 2926.95] + - [599, 2926.95] - - [128, 2944, 1, 1280] - - [504, 5109.69] + - [610, 5109.69] - - [64, 64, 1, 3328] - - [472, 1252.99] + - [578, 1252.99] - - [448, 2944, 1, 1280] - - [513, 6684.47] + - [619, 6684.47] - - [512, 24000, 1, 2048] - - [509, 7939.03] + - [615, 7939.03] - - [128, 256, 1, 3328] - - [490, 3276.9] + - [596, 3276.9] - - [1408, 5056, 1, 3328] - - [515, 8959.21] + - [621, 8959.21] - - [1856, 1856, 1, 3328] - - [505, 8006.17] + - [611, 8006.17] - - [3584, 128, 1, 256] - - [510, 4292.52] + - [616, 4292.52] - - [2560, 800, 1, 2560] - - [505, 6262.48] + - [611, 6262.48] - - [448, 1408, 1, 3328] - - [521, 4997.35] + - [627, 4997.35] - - [2368, 2368, 1, 256] - - [523, 4978.94] + - [629, 4978.94] - - [4288, 4288, 1, 1280] - - [502, 8617.78] + - [608, 8617.78] - - [64, 448, 1, 1280] - - [448, 2057.28] + - [554, 2057.28] - - [5888, 1024, 1, 1280] - - [520, 6848.17] + - [626, 6848.17] - - [1408, 4288, 1, 256] - - [503, 7077.01] + - [609, 7077.01] - - [448, 4, 1, 256] - - [528, 84.4294] + - [634, 84.4294] - - [5888, 448, 1, 128] - - [497, 3493.91] + - [603, 3493.91] - - [512, 48000, 1, 2560] - - [515, 8960.13] + - [621, 8960.13] - - [35, 8457, 1, 1760] - - [417, 3934.78] + - [523, 3934.78] - - [704, 6784, 1, 3328] - - [502, 8180.88] + - [608, 8180.88] - - [2560, 6400, 1, 2560] - - [503, 7822.24] + - [609, 7822.24] - - [5056, 1024, 1, 1280] - - [505, 8357.38] + - [611, 8357.38] - - [448, 5888, 1, 3328] - - [509, 7505.28] + - [615, 7505.28] - - [128, 4, 1, 128] - - [524, 0.662251] + - [630, 0.662251] - - [1024, 2944, 1, 1280] - - [509, 8406.24] + - [615, 8406.24] - - [5056, 5888, 1, 1280] - - [509, 8819.76] + - [615, 8819.76] - - [4288, 5888, 1, 128] - - [494, 3522.32] + - [600, 3522.32] - - [256, 3584, 1, 256] - - [505, 5883.89] + - [611, 5883.89] - - [1408, 3584, 1, 128] - - [493, 4283.41] + - [599, 4283.41] - - [256, 2944, 1, 3328] - - [513, 5670.63] + - [619, 5670.63] - - [448, 3584, 1, 128] - - [497, 3171.72] + - [603, 3171.72] - - [5888, 2944, 1, 1280] - - [515, 8198.86] + - [621, 8198.86] - - [4, 6784, 1, 1280] - - [462, 553.896] + - [568, 553.896] - - [2368, 5888, 1, 128] - - [493, 4787.32] + - [599, 4787.32] - - [8448, 16, 1, 2816] - - [452, 2452.63] + - [558, 2452.63] - - [64, 2944, 1, 128] - - [425, 1376.66] + - [531, 1376.66] - - [2368, 4, 1, 256] - - [447, 278.177] + - [553, 278.177] - - [3584, 5888, 1, 256] - - [523, 6233.66] + - [629, 6233.66] - - [2368, 1024, 1, 128] - - [494, 3781.51] + - [600, 3781.51] - - [2368, 704, 1, 128] - - [494, 3198.32] + - [600, 3198.32] - - [3584, 2944, 1, 1280] - - [505, 8045.68] + - [611, 8045.68] - - [3584, 2368, 1, 128] - - [494, 4188.57] + - [600, 4188.57] - - [5056, 704, 1, 128] - - [497, 4019.21] + - [603, 4019.21] - - [448, 2368, 1, 128] - - [499, 2522.21] + - [605, 2522.21] - - [5056, 1408, 1, 3328] - - [507, 8349.93] + - [613, 8349.93] - - [1408, 704, 1, 256] - - [513, 4741.42] + - [619, 4741.42] - - [6784, 1024, 1, 3328] - - [515, 8769.5] + - [621, 8769.5] - - [6784, 2944, 1, 3328] - - [512, 7319.74] + - [618, 7319.74] - - [2944, 5056, 1, 3328] - - [502, 8889.76] + - [608, 8889.76] - - [1856, 1856, 1, 256] - - [505, 6309.84] + - [611, 6309.84] - - [1024, 5888, 1, 128] - - [496, 3759.6] + - [602, 3759.6] - - [6784, 2368, 1, 1280] - - [505, 8298.4] + - [611, 8298.4] - - [256, 4, 1, 128] - - [524, 7.10171] + - [630, 7.10171] - - [4288, 5888, 1, 1280] - - [509, 8365.28] + - [615, 8365.28] - - [4288, 4288, 1, 256] - - [509, 6513.78] + - [615, 6513.78] - - [8448, 32, 1, 2816] - - [480, 4257.74] + - [586, 4257.74] - - [448, 2944, 1, 3328] - - [513, 6875.62] + - [619, 6875.62] - - [5888, 4, 1, 128] - - [524, 163.94] + - [630, 163.94] - - [4288, 1856, 1, 1280] - - [509, 8402.91] + - [615, 8402.91] - - [1856, 2944, 1, 3328] - - [509, 6612.21] + - [615, 6612.21] - - [256, 6784, 1, 3328] - - [510, 7358.7] + - [616, 7358.7] - - [64, 5888, 1, 256] - - [504, 3359.05] + - [610, 3359.05] - - [256, 5056, 1, 128] - - [497, 2489.21] + - [603, 2489.21] - - [5056, 1024, 1, 256] - - [515, 8077.87] + - [621, 8077.87] - - [704, 64, 1, 3328] - - [459, 3288.4] + - [565, 3288.4] - - [5056, 1856, 1, 3328] - - [513, 8171.13] + - [619, 8171.13] - - [4, 2944, 1, 3328] - - [472, 546.843] + - [578, 546.843] - - [4, 5056, 1, 256] - - [447, 378.561] + - [553, 378.561] - - [1856, 1408, 1, 256] - - [515, 6320.88] + - [621, 6320.88] - - [8448, 12000, 1, 2816] - - [513, 7365.87] + - [619, 7365.87] - - [6784, 128, 1, 3328] - - [510, 6366.57] + - [616, 6366.57] - - [4288, 1408, 1, 128] - - [493, 4451.7] + - [599, 4451.7] - - [1856, 5888, 1, 3328] - - [511, 8619.76] + - [617, 8619.76] - - [4288, 5056, 1, 256] - - [515, 7289.05] + - [621, 7289.05] - - [1408, 128, 1, 1280] - - [453, 4291.15] + - [559, 4291.15] - - [4096, 800, 1, 1024] - - [504, 5867.89] + - [610, 5867.89] - - [5056, 256, 1, 3328] - - [510, 7527.61] + - [616, 7527.61] - - [704, 704, 1, 256] - - [510, 4417.85] + - [616, 4417.85] - - [1024, 5888, 1, 1280] - - [515, 8674.57] + - [621, 8674.57] - - [6784, 2368, 1, 128] - - [493, 4724.08] + - [599, 4724.08] - - [4, 5056, 1, 1280] - - [462, 540.307] + - [568, 540.307] - - [256, 64, 1, 1280] - - [464, 1515.38] + - [570, 1515.38] - - [128, 1856, 1, 1280] - - [504, 4574.21] + - [610, 4574.21] - - [1856, 1024, 1, 1280] - - [509, 7741.61] + - [615, 7741.61] - - [6784, 4288, 1, 1280] - - [515, 8521.29] + - [621, 8521.29] - - [2560, 64, 1, 2560] - - [446, 3504.7] + - [552, 3504.7] - - [1856, 1856, 1, 1280] - - [505, 7779.31] + - [611, 7779.31] - - [4096, 400, 1, 1024] - - [515, 4157.81] + - [621, 4157.81] - - [3072, 24000, 1, 1024] - - [515, 8663.45] + - [621, 8663.45] - - [128, 4288, 1, 3328] - - [461, 5674.23] + - [567, 5674.23] - - [4, 2368, 1, 3328] - - [472, 525.48] + - [578, 525.48] - - [5888, 1856, 1, 128] - - [497, 4099.74] + - [603, 4099.74] - - [448, 704, 1, 1280] - - [510, 4309.47] + - [616, 4309.47] - - [128, 5056, 1, 1280] - - [453, 5068.46] + - [559, 5068.46] - - [1024, 448, 1, 3328] - - [513, 6077.82] + - [619, 6077.82] - - [1856, 704, 1, 1280] - - [521, 6257.49] + - [627, 6257.49] - - [5056, 3584, 1, 128] - - [494, 4598.52] + - [600, 4598.52] - - [5888, 5888, 1, 3328] - - [515, 8058.25] + - [621, 8058.25] - - [6784, 1024, 1, 256] - - [515, 5120.99] + - [621, 5120.99] - - [2944, 2368, 1, 256] - - [506, 6523.03] + - [612, 6523.03] - - [256, 448, 1, 256] - - [456, 1816.94] + - [562, 1816.94] - - [5056, 5888, 1, 3328] - - [508, 6722.41] + - [614, 6722.41] - - [1856, 1024, 1, 256] - - [515, 6632.31] + - [621, 6632.31] - - [512, 48000, 1, 1536] - - [509, 8556.01] + - [615, 8556.01] - - [3584, 448, 1, 1280] - - [504, 6567.09] + - [610, 6567.09] - - [8448, 5984, 1, 2816] - - [509, 8990.66] + - [615, 8990.66] - - [448, 5888, 1, 256] - - [509, 6220.47] + - [615, 6220.47] - - [704, 64, 1, 128] - - [422, 450.66] + - [528, 450.66] - - [1408, 6784, 1, 3328] - - [502, 8478.68] + - [608, 8478.68] - - [448, 1024, 1, 128] - - [501, 1844.33] + - [607, 1844.33] - - [4288, 704, 1, 128] - - [497, 3895.26] + - [603, 3895.26] - - [128, 1856, 1, 128] - - [428, 1456.46] + - [534, 1456.46] - - [448, 2368, 1, 3328] - - [507, 5538.04] + - [613, 5538.04] - - [5056, 64, 1, 128] - - [493, 1648.94] + - [599, 1648.94] - - [5056, 2944, 1, 256] - - [509, 8230.87] + - [615, 8230.87] - - [6784, 5888, 1, 128] - - [493, 4873.19] + - [599, 4873.19] - - [1024, 700, 1, 512] - - [507, 4445.37] + - [613, 4445.37] - - [704, 1024, 1, 256] - - [505, 4707.99] + - [611, 4707.99] - - [1024, 4, 1, 256] - - [447, 174.863] + - [553, 174.863] - - [2944, 704, 1, 128] - - [497, 3483.42] + - [603, 3483.42] - - [128, 6784, 1, 1280] - - [505, 6522.93] + - [611, 6522.93] - - [1408, 3584, 1, 3328] - - [509, 8673.59] + - [615, 8673.59] - - [2368, 6784, 1, 256] - - [505, 7941.76] + - [611, 7941.76] - - [5056, 1408, 1, 1280] - - [509, 8801.01] + - [615, 8801.01] - - [256, 256, 1, 128] - - [434, 551.982] + - [540, 551.982] - - [5056, 4288, 1, 128] - - [501, 3793.64] + - [607, 3793.64] - - [1408, 1856, 1, 128] - - [493, 3067.74] + - [599, 3067.74] - - [1408, 5888, 1, 3328] - - [509, 9148.97] + - [615, 9148.97] - - [1856, 256, 1, 256] - - [505, 4319.52] + - [611, 4319.52] - - [6784, 6784, 1, 256] - - [505, 7668.53] + - [611, 7668.53] - - [64, 256, 1, 128] - - [439, 131.172] + - [545, 131.172] - - [4288, 2368, 1, 128] - - [494, 4582.99] + - [600, 4582.99] - - [256, 4288, 1, 1280] - - [504, 6058.61] + - [610, 6058.61] - - [2368, 2944, 1, 256] - - [509, 8016.07] + - [615, 8016.07] - - [4, 1856, 1, 256] - - [526, 252.832] + - [632, 252.832] - - [3584, 1856, 1, 1280] - - [505, 7760.24] + - [611, 7760.24] - - [6784, 6784, 1, 128] - - [494, 4970.14] + - [600, 4970.14] - - [256, 1856, 1, 128] - - [500, 1580.59] + - [606, 1580.59] - - [704, 64, 1, 1280] - - [489, 2556.47] + - [595, 2556.47] - - [5888, 5056, 1, 256] - - [509, 8216.67] + - [615, 8216.67] - - [8448, 48000, 1, 2816] - - [515, 4082.89] + - [621, 4082.89] - - [3584, 448, 1, 256] - - [509, 5518.92] + - [615, 5518.92] - - [448, 4288, 1, 128] - - [497, 3415.25] + - [603, 3415.25] - - [7680, 64, 1, 2560] - - [458, 5162.1] + - [564, 5162.1] - - [256, 6784, 1, 256] - - [509, 6272.62] + - [615, 6272.62] - - [1408, 4288, 1, 128] - - [497, 4343.63] + - [603, 4343.63] - - [2944, 704, 1, 3328] - - [504, 7679.71] + - [610, 7679.71] - - [128, 448, 1, 256] - - [444, 1422.59] + - [550, 1422.59] - - [5056, 256, 1, 1280] - - [511, 5052.39] + - [617, 5052.39] - - [2560, 32, 1, 2560] - - [467, 3106.07] + - [573, 3106.07] - - [3584, 3584, 1, 256] - - [515, 8260.57] + - [621, 8260.57] - - [448, 1408, 1, 128] - - [493, 2397.38] + - [599, 2397.38] - - [128, 256, 1, 1280] - - [448, 2340.67] + - [554, 2340.67] - - [3584, 5056, 1, 256] - - [515, 7347.56] + - [621, 7347.56] - - [6784, 128, 1, 256] - - [505, 5591.1] + - [611, 5591.1] - - [4288, 4, 1, 256] - - [447, 354.206] + - [553, 354.206] - - [704, 448, 1, 256] - - [510, 3492.33] + - [616, 3492.33] - - [2944, 2368, 1, 1280] - - [517, 6661.71] + - [623, 6661.71] - - [448, 64, 1, 3328] - - [489, 3058.45] + - [595, 3058.45] - - [1408, 3584, 1, 256] - - [515, 7966.59] + - [621, 7966.59] - - [3584, 4, 1, 3328] - - [528, 605.559] + - [634, 605.559] - - [6784, 3584, 1, 256] - - [505, 7525.41] + - [611, 7525.41] - - [256, 128, 1, 128] - - [437, 276.041] + - [543, 276.041] - - [704, 1408, 1, 128] - - [494, 3109.85] + - [600, 3109.85] - - [4, 2368, 1, 256] - - [528, 283.375] + - [634, 283.375] - - [4288, 128, 1, 1280] - - [510, 5132.65] + - [616, 5132.65] - - [128, 1408, 1, 256] - - [504, 2733.35] + - [610, 2733.35] - - [4, 2944, 1, 256] - - [526, 314.127] + - [632, 314.127] - - [64, 128, 1, 3328] - - [474, 1514.71] + - [580, 1514.71] - - [5056, 2368, 1, 128] - - [498, 3449.17] + - [604, 3449.17] - - [2944, 2944, 1, 3328] - - [502, 8169.03] + - [608, 8169.03] - - [5056, 6784, 1, 256] - - [522, 5792.77] + - [628, 5792.77] - - [1856, 3584, 1, 128] - - [499, 4213.5] + - [605, 4213.5] - - [128, 2944, 1, 128] - - [423, 1970.46] + - [529, 1970.46] - - [35, 8457, 1, 2560] - - [418, 3525.15] + - [524, 3525.15] - - [1024, 704, 1, 3328] - - [504, 6784.99] + - [610, 6784.99] - - [6784, 448, 1, 256] - - [513, 6544.88] + - [619, 6544.88] - - [3584, 6784, 1, 128] - - [493, 4623.6] + - [599, 4623.6] - - [128, 4288, 1, 256] - - [507, 3606.6] + - [613, 3606.6] - - [704, 448, 1, 3328] - - [504, 4478.01] + - [610, 4478.01] - - [128, 128, 1, 3328] - - [489, 2177.65] + - [595, 2177.65] - - [5056, 1856, 1, 256] - - [523, 5608.72] + - [629, 5608.72] - - [4608, 5984, 1, 1536] - - [512, 7859.85] + - [618, 7859.85] - - [256, 128, 1, 256] - - [448, 998.744] + - [554, 998.744] - - [1760, 3200, 1, 1760] - - [505, 8179.64] + - [611, 8179.64] - - [1024, 1856, 1, 256] - - [515, 6143.27] + - [621, 6143.27] - - [4096, 1600, 1, 1024] - - [523, 5851.52] + - [629, 5851.52] - - [4288, 64, 1, 128] - - [428, 1372.26] + - [534, 1372.26] - - [256, 448, 1, 3328] - - [467, 4795.1] + - [573, 4795.1] - - [1408, 6784, 1, 1280] - - [509, 8426.5] + - [615, 8426.5] - - [3584, 3584, 1, 1280] - - [509, 7556.56] + - [615, 7556.56] - - [7680, 24000, 1, 2560] - - [502, 5019.19] + - [608, 5019.19] - - [64, 2368, 1, 1280] - - [453, 4061.8] + - [559, 4061.8] - - [448, 2368, 1, 1280] - - [504, 5928.77] + - [610, 5928.77] - - [4608, 48000, 1, 1536] - - [509, 6937.4] + - [615, 6937.4] - - [5888, 5888, 1, 128] - - [494, 3744.0] + - [600, 3744.0] - - [64, 6784, 1, 3328] - - [504, 5988.72] + - [610, 5988.72] - - [2944, 256, 1, 1280] - - [510, 6717.97] + - [616, 6717.97] - - [2048, 16, 1, 2048] - - [462, 1210.58] + - [568, 1210.58] - - [256, 2368, 1, 128] - - [497, 1936.07] + - [603, 1936.07] - - [5056, 2368, 1, 3328] - - [515, 8875.63] + - [621, 8875.63] - - [2944, 4288, 1, 256] - - [509, 8063.24] + - [615, 8063.24] - - [1408, 3584, 1, 1280] - - [505, 8197.07] + - [611, 8197.07] - - [2368, 64, 1, 256] - - [504, 2365.79] + - [610, 2365.79] - - [64, 448, 1, 3328] - - [490, 3027.4] + - [596, 3027.4] - - [704, 128, 1, 3328] - - [461, 4452.19] + - [567, 4452.19] - - [8192, 1600, 1, 2048] - - [509, 7229.93] + - [615, 7229.93] - - [1856, 704, 1, 256] - - [511, 5545.45] + - [617, 5545.45] - - [4, 4288, 1, 1280] - - [462, 523.825] + - [568, 523.825] - - [1408, 448, 1, 3328] - - [516, 4789.4] + - [622, 4789.4] - - [1024, 4, 1, 3328] - - [442, 504.223] + - [548, 504.223] - - [512, 24000, 1, 2560] - - [515, 8903.62] + - [621, 8903.62] - - [2368, 6784, 1, 3328] - - [515, 8311.14] + - [621, 8311.14] - - [1856, 1408, 1, 1280] - - [505, 8160.11] + - [611, 8160.11] - - [1856, 448, 1, 1280] - - [507, 6243.07] + - [613, 6243.07] - - [6784, 704, 1, 128] - - [493, 4069.05] + - [599, 4069.05] - - [4, 4, 1, 256] - - [462, 0.842029] + - [568, 0.842029] - - [128, 5888, 1, 128] - - [493, 2328.02] + - [599, 2328.02] - - [1408, 5888, 1, 256] - - [504, 6986.91] + - [610, 6986.91] - - [704, 2944, 1, 1280] - - [505, 7905.03] + - [611, 7905.03] - - [4288, 64, 1, 1280] - - [480, 3828.27] + - [586, 3828.27] - - [256, 64, 1, 256] - - [455, 655.46] + - [561, 655.46] - - [704, 1856, 1, 256] - - [513, 5444.37] + - [619, 5444.37] - - [704, 6784, 1, 128] - - [493, 4319.77] + - [599, 4319.77] - - [3584, 704, 1, 1280] - - [513, 7726.43] + - [619, 7726.43] - - [256, 128, 1, 1280] - - [448, 2184.63] + - [554, 2184.63] - - [5888, 2368, 1, 256] - - [515, 8192.69] + - [621, 8192.69] - - [256, 2368, 1, 1280] - - [510, 5675.54] + - [616, 5675.54] - - [2944, 6784, 1, 128] - - [498, 4248.35] + - [604, 4248.35] - - [3584, 448, 1, 3328] - - [509, 6560.77] + - [615, 6560.77] - - [1408, 4, 1, 256] - - [527, 176.79] + - [633, 176.79] - - [704, 2368, 1, 3328] - - [510, 7085.31] + - [616, 7085.31] - - [2944, 448, 1, 256] - - [506, 3412.0] + - [612, 3412.0] - - [1856, 448, 1, 128] - - [494, 2748.82] + - [600, 2748.82] - - [4288, 4, 1, 3328] - - [462, 553.648] + - [568, 553.648] - - [2368, 128, 1, 1280] - - [483, 4173.65] + - [589, 4173.65] - - [256, 5888, 1, 128] - - [498, 2860.98] + - [604, 2860.98] - - [64, 6784, 1, 256] - - [511, 3637.18] + - [617, 3637.18] - - [64, 5056, 1, 1280] - - [510, 4289.53] + - [616, 4289.53] - - [4, 6784, 1, 128] - - [524, 160.906] + - [630, 160.906] - - [2048, 3200, 1, 512] - - [511, 6927.09] + - [617, 6927.09] - - [2944, 2944, 1, 1280] - - [503, 6267.85] + - [609, 6267.85] - - [5056, 448, 1, 3328] - - [504, 7400.36] + - [610, 7400.36] - - [4, 3584, 1, 1280] - - [462, 499.83] + - [568, 499.83] - - [1408, 128, 1, 128] - - [439, 1037.36] + - [545, 1037.36] - - [6784, 704, 1, 3328] - - [510, 7633.95] + - [616, 7633.95] - - [128, 64, 1, 1280] - - [462, 1170.39] + - [568, 1170.39] - - [2368, 256, 1, 1280] - - [510, 5609.89] + - [616, 5609.89] - - [4, 448, 1, 3328] - - [530, 358.5] + - [636, 358.5] - - [5888, 4288, 1, 128] - - [498, 4521.74] + - [604, 4521.74] - - [4, 5888, 1, 256] - - [462, 353.933] + - [568, 353.933] - - [1408, 2944, 1, 3328] - - [503, 8951.41] + - [609, 8951.41] - - [3584, 704, 1, 128] - - [493, 3395.41] + - [599, 3395.41] - - [4608, 12000, 1, 1536] - - [502, 6609.99] + - [608, 6609.99] - - [64, 1024, 1, 256] - - [448, 1588.85] + - [554, 1588.85] - - [5056, 5056, 1, 128] - - [493, 4080.81] + - [599, 4080.81] - - [2368, 448, 1, 1280] - - [504, 5423.04] + - [610, 5423.04] - - [128, 3584, 1, 256] - - [510, 4705.25] + - [616, 4705.25] - - [704, 448, 1, 1280] - - [507, 3961.07] + - [613, 3961.07] - - [8192, 800, 1, 2048] - - [505, 6306.36] + - [611, 6306.36] - - [448, 5056, 1, 128] - - [497, 3709.56] + - [603, 3709.56] - - [256, 4, 1, 1280] - - [529, 163.94] + - [635, 163.94] - - [5056, 3584, 1, 256] - - [502, 7008.34] + - [608, 7008.34] - - [2368, 4, 1, 3328] - - [462, 496.366] + - [568, 496.366] - - [1408, 5056, 1, 128] - - [497, 4175.37] + - [603, 4175.37] - - [2944, 3584, 1, 128] - - [493, 4659.79] + - [599, 4659.79] - - [3584, 2368, 1, 256] - - [515, 5851.87] + - [621, 5851.87] - - [128, 3584, 1, 3328] - - [505, 6105.04] + - [611, 6105.04] - - [128, 1024, 1, 1280] - - [445, 3848.09] + - [551, 3848.09] - - [8448, 24000, 1, 2816] - - [515, 5128.64] + - [621, 5128.64] - - [64, 704, 1, 256] - - [448, 1253.83] + - [554, 1253.83] - - [4288, 256, 1, 1280] - - [504, 5625.86] + - [610, 5625.86] - - [3584, 3584, 1, 3328] - - [509, 8206.15] + - [615, 8206.15] - - [4, 704, 1, 128] - - [524, 29.5484] + - [630, 29.5484] - - [5888, 6784, 1, 256] - - [511, 8248.75] + - [617, 8248.75] - - [4288, 2944, 1, 3328] - - [509, 8657.12] + - [615, 8657.12] - - [2944, 64, 1, 128] - - [428, 1240.7] + - [534, 1240.7] - - [1024, 128, 1, 3328] - - [453, 4433.1] + - [559, 4433.1] - - [1024, 16, 1, 500000] - - [416, 2571.15] + - [522, 2571.15] - - [4288, 128, 1, 3328] - - [453, 5716.85] + - [559, 5716.85] - - [7680, 128, 1, 2560] - - [451, 5488.1] + - [557, 5488.1] - - [256, 5056, 1, 1280] - - [511, 6380.06] + - [617, 6380.06] - - [1408, 256, 1, 128] - - [497, 1633.83] + - [603, 1633.83] - - [2944, 5888, 1, 3328] - - [506, 7849.02] + - [612, 7849.02] - - [6784, 5888, 1, 1280] - - [515, 9047.72] + - [621, 9047.72] - - [2048, 800, 1, 512] - - [510, 4841.17] + - [616, 4841.17] - - [704, 128, 1, 256] - - [455, 1567.27] + - [561, 1567.27] - - [5888, 4288, 1, 1280] - - [509, 7982.93] + - [615, 7982.93] - - [1024, 24000, 1, 2048] - - [511, 5774.4] + - [617, 5774.4] - - [448, 256, 1, 1280] - - [445, 3707.19] + - [551, 3707.19] - - [5888, 3584, 1, 128] - - [498, 3804.5] + - [604, 3804.5] - - [1024, 2944, 1, 128] - - [493, 3308.36] + - [599, 3308.36] - - [5056, 4, 1, 1280] - - [526, 469.062] + - [632, 469.062] - - [256, 1408, 1, 1280] - - [504, 4899.99] + - [610, 4899.99] - - [3072, 16, 1, 1024] - - [462, 1233.72] + - [568, 1233.72] - - [704, 3584, 1, 128] - - [493, 3919.53] + - [599, 3919.53] - - [5888, 448, 1, 3328] - - [523, 6095.71] + - [629, 6095.71] - - [2368, 4288, 1, 1280] - - [505, 8338.4] + - [611, 8338.4] - - [4288, 2944, 1, 128] - - [497, 3946.6] + - [603, 3946.6] - - [1024, 6784, 1, 3328] - - [511, 7494.38] + - [617, 7494.38] - - [128, 2368, 1, 256] - - [510, 2895.42] + - [616, 2895.42] - - [6784, 64, 1, 3328] - - [504, 5964.99] + - [610, 5964.99] - - [5056, 2944, 1, 3328] - - [515, 6605.63] + - [621, 6605.63] - - [448, 128, 1, 256] - - [448, 1339.52] + - [554, 1339.52] - - [2944, 3584, 1, 256] - - [511, 7165.66] + - [617, 7165.66] - - [1408, 1408, 1, 3328] - - [515, 8332.96] + - [621, 8332.96] - - [1856, 128, 1, 1280] - - [510, 4498.43] + - [616, 4498.43] - - [3584, 3584, 1, 128] - - [494, 4000.11] + - [600, 4000.11] - - [64, 3584, 1, 256] - - [521, 2383.23] + - [627, 2383.23] - - [1408, 4, 1, 3328] - - [472, 423.008] + - [578, 423.008] - - [128, 2944, 1, 3328] - - [477, 5430.03] + - [583, 5430.03] - - [3584, 704, 1, 256] - - [510, 6154.09] + - [616, 6154.09] - - [2944, 448, 1, 3328] - - [510, 6507.82] + - [616, 6507.82] - - [3584, 1408, 1, 3328] - - [515, 8829.73] + - [621, 8829.73] - - [704, 3584, 1, 1280] - - [505, 7860.33] + - [611, 7860.33] - - [2944, 6784, 1, 1280] - - [515, 8894.6] + - [621, 8894.6] - - [1856, 6784, 1, 256] - - [515, 8115.19] + - [621, 8115.19] - - [4288, 448, 1, 3328] - - [507, 6397.35] + - [613, 6397.35] - - [6784, 4288, 1, 128] - - [493, 4109.54] + - [599, 4109.54] - - [6784, 704, 1, 1280] - - [503, 7999.14] + - [609, 7999.14] - - [256, 4288, 1, 256] - - [507, 4603.94] + - [613, 4603.94] - - [3584, 6784, 1, 256] - - [515, 7361.65] + - [621, 7361.65] - - [6144, 12000, 1, 2048] - - [514, 6311.76] + - [620, 6311.76] - - [6144, 16, 1, 2560] - - [463, 2240.65] + - [569, 2240.65] - - [3584, 64, 1, 128] - - [434, 1292.36] + - [540, 1292.36] - - [5888, 1024, 1, 3328] - - [502, 8394.59] + - [608, 8394.59] - - [448, 64, 1, 128] - - [425, 262.244] + - [531, 262.244] - - [704, 6784, 1, 1280] - - [509, 7740.66] + - [615, 7740.66] - - [4, 1024, 1, 1280] - - [462, 378.921] + - [568, 378.921] - - [5888, 128, 1, 256] - - [510, 5003.68] + - [616, 5003.68] - - [4096, 16, 1, 4096] - - [462, 1585.85] + - [568, 1585.85] - - [1856, 5056, 1, 3328] - - [503, 8522.92] + - [609, 8522.92] - - [4, 6784, 1, 256] - - [447, 387.757] + - [553, 387.757] - - [1024, 3584, 1, 128] - - [497, 3031.61] + - [603, 3031.61] - - [1024, 1408, 1, 128] - - [499, 2600.85] + - [605, 2600.85] - - [2368, 2944, 1, 128] - - [496, 4340.26] + - [602, 4340.26] - - [5056, 64, 1, 256] - - [510, 3109.62] + - [616, 3109.62] - - [4, 448, 1, 1280] - - [530, 253.835] + - [636, 253.835] - - [5056, 2944, 1, 128] - - [501, 3740.01] + - [607, 3740.01] - - [5888, 5056, 1, 3328] - - [515, 9016.48] + - [621, 9016.48] - - [1024, 704, 1, 128] - - [497, 2363.66] + - [603, 2363.66] - - [5888, 2368, 1, 128] - - [500, 3651.83] + - [606, 3651.83] - - [128, 5056, 1, 3328] - - [504, 6243.64] + - [610, 6243.64] - - [3584, 6784, 1, 1280] - - [502, 9080.67] + - [608, 9080.67] - - [448, 4, 1, 1280] - - [530, 243.083] + - [636, 243.083] - - [1856, 5888, 1, 256] - - [515, 8182.12] + - [621, 8182.12] - - [256, 256, 1, 256] - - [448, 1542.12] + - [554, 1542.12] - - [256, 64, 1, 128] - - [429, 135.226] + - [535, 135.226] - - [4288, 4288, 1, 3328] - - [515, 8674.64] + - [621, 8674.64] - - [4288, 1408, 1, 1280] - - [503, 7867.18] + - [609, 7867.18] - - [3584, 5056, 1, 128] - - [493, 4457.83] + - [599, 4457.83] - - [4, 1024, 1, 3328] - - [442, 440.394] + - [548, 440.394] - - [4288, 2368, 1, 256] - - [523, 5699.57] + - [629, 5699.57] - - [2944, 5056, 1, 1280] - - [515, 8236.56] + - [621, 8236.56] - - [448, 6784, 1, 256] - - [505, 6620.62] + - [611, 6620.62] - - [64, 128, 1, 128] - - [430, 67.6629] + - [536, 67.6629] - - [1856, 2368, 1, 128] - - [497, 4233.7] + - [603, 4233.7] - - [6784, 2368, 1, 3328] - - [515, 8269.9] + - [621, 8269.9] - - [256, 1024, 1, 1280] - - [504, 4882.88] + - [610, 4882.88] - - [704, 4, 1, 128] - - [524, 19.111] + - [630, 19.111] - - [256, 4, 1, 256] - - [462, 46.9114] + - [568, 46.9114] - - [4288, 128, 1, 256] - - [510, 4273.49] + - [616, 4273.49] - - [4288, 1856, 1, 3328] - - [505, 8195.81] + - [611, 8195.81] - - [3584, 448, 1, 128] - - [498, 2750.65] + - [604, 2750.65] - - [2048, 1600, 1, 2048] - - [521, 5753.59] + - [627, 5753.59] - - [256, 4, 1, 3328] - - [531, 297.978] + - [637, 297.978] - - [4, 1408, 1, 1280] - - [529, 402.386] + - [635, 402.386] - - [3584, 64, 1, 1280] - - [518, 4096.1] + - [624, 4096.1] - - [1408, 448, 1, 128] - - [493, 2498.25] + - [599, 2498.25] - - [3584, 1024, 1, 1280] - - [515, 7252.18] + - [621, 7252.18] - - [1856, 5056, 1, 256] - - [509, 7711.59] + - [615, 7711.59] - - [4, 3584, 1, 256] - - [526, 314.314] + - [632, 314.314] - - [4, 2944, 1, 1280] - - [462, 483.218] + - [568, 483.218] - - [1024, 4288, 1, 256] - - [514, 6544.52] + - [620, 6544.52] - - [5888, 3584, 1, 3328] - - [503, 8105.15] + - [609, 8105.15] - - [1856, 4, 1, 256] - - [462, 252.832] + - [568, 252.832] - - [4, 256, 1, 256] - - [447, 48.2882] + - [553, 48.2882] - - [5056, 3584, 1, 3328] - - [508, 7354.8] + - [614, 7354.8] - - [704, 448, 1, 128] - - [501, 1233.91] + - [607, 1233.91] - - [2368, 1408, 1, 1280] - - [509, 6654.24] + - [615, 6654.24] - - [5056, 2944, 1, 1280] - - [515, 8505.72] + - [621, 8505.72] - - [4, 4, 1, 128] - - [525, 0.1478505] + - [631, 0.1478505] - - [3584, 256, 1, 256] - - [507, 4616.47] + - [613, 4616.47] - - [1024, 6784, 1, 256] - - [509, 7944.98] + - [615, 7944.98] - - [4, 128, 1, 256] - - [462, 29.3571] + - [568, 29.3571] - - [64, 64, 1, 1280] - - [473, 642.61] + - [579, 642.61] - - [5124, 9124, 1, 2048] - - [515, 8019.4] + - [621, 8019.4] - - [6784, 4, 1, 128] - - [524, 193.067] + - [630, 193.067] - - [2944, 1408, 1, 128] - - [493, 3827.13] + - [599, 3827.13] - - [448, 128, 1, 3328] - - [466, 4064.0] + - [572, 4064.0] - - [3584, 1408, 1, 1280] - - [515, 7180.83] + - [621, 7180.83] - - [64, 4288, 1, 3328] - - [461, 4786.84] + - [567, 4786.84] - - [5056, 6784, 1, 3328] - - [502, 7889.83] + - [608, 7889.83] - - [128, 2944, 1, 256] - - [505, 3599.69] + - [611, 3599.69] - - [128, 6784, 1, 128] - - [423, 2606.79] + - [529, 2606.79] - - [3584, 4288, 1, 256] - - [509, 7299.81] + - [615, 7299.81] - - [448, 1856, 1, 256] - - [505, 5207.07] + - [611, 5207.07] - - [1856, 6784, 1, 3328] - - [507, 8386.36] + - [613, 8386.36] - - [3584, 128, 1, 3328] - - [451, 5590.04] + - [557, 5590.04] - - [64, 1856, 1, 256] - - [444, 1949.38] + - [550, 1949.38] - - [64, 448, 1, 256] - - [449, 955.833] + - [555, 955.833] - - [5888, 4288, 1, 256] - - [513, 7791.84] + - [619, 7791.84] - - [4, 448, 1, 128] - - [524, 8.84146] + - [630, 8.84146] - - [5056, 1408, 1, 256] - - [515, 5154.01] + - [621, 5154.01] - - [35, 8457, 1, 2048] - - [420, 3182.57] + - [526, 3182.57] - - [64, 256, 1, 1280] - - [469, 1713.46] + - [575, 1713.46] - - [3584, 1024, 1, 256] - - [505, 6528.18] + - [611, 6528.18] - - [256, 704, 1, 256] - - [504, 2720.46] + - [610, 2720.46] - - [5888, 5888, 1, 256] - - [513, 7992.26] + - [619, 7992.26] - - [4288, 1024, 1, 1280] - - [507, 7837.5] + - [613, 7837.5] - - [5888, 128, 1, 3328] - - [510, 7181.13] + - [616, 7181.13] - - [448, 6784, 1, 3328] - - [504, 7663.1] + - [610, 7663.1] - - [2944, 1408, 1, 1280] - - [513, 7903.14] + - [619, 7903.14] - - [64, 128, 1, 1280] - - [462, 1191.66] + - [568, 1191.66] - - [2944, 1856, 1, 3328] - - [503, 7844.41] + - [609, 7844.41] - - [2368, 64, 1, 128] - - [434, 997.973] + - [540, 997.973] - - [256, 1024, 1, 128] - - [493, 1215.84] + - [599, 1215.84] - - [3584, 5888, 1, 1280] - - [502, 8958.94] + - [608, 8958.94] - - [64, 4, 1, 128] - - [525, 1.21608] + - [631, 1.21608] - - [6784, 1856, 1, 1280] - - [502, 6728.8] + - [608, 6728.8] - - [2944, 5056, 1, 256] - - [515, 8275.21] + - [621, 8275.21] - - [4288, 4, 1, 128] - - [524, 147.644] + - [630, 147.644] - - [5888, 256, 1, 3328] - - [511, 7094.2] + - [617, 7094.2] - - [2944, 4288, 1, 128] - - [496, 4611.55] + - [602, 4611.55] - - [3584, 1408, 1, 256] - - [506, 6543.06] + - [612, 6543.06] - - [704, 3584, 1, 3328] - - [505, 8117.2] + - [611, 8117.2] - - [4096, 3200, 1, 1024] - - [520, 6656.13] + - [626, 6656.13] - - [5056, 448, 1, 1280] - - [518, 6096.2] + - [624, 6096.2] - - [3584, 1856, 1, 3328] - - [503, 8552.41] + - [609, 8552.41] - - [4288, 6784, 1, 1280] - - [509, 8212.46] + - [615, 8212.46] - - [2560, 7000, 1, 2560] - - [511, 7655.34] + - [617, 7655.34] - - [1408, 704, 1, 1280] - - [507, 5756.79] + - [613, 5756.79] - - [2944, 1024, 1, 256] - - [515, 6880.91] + - [621, 6880.91] - - [6784, 64, 1, 256] - - [510, 4438.96] + - [616, 4438.96] - - [2368, 4288, 1, 3328] - - [511, 8377.99] + - [617, 8377.99] - - [4, 1408, 1, 256] - - [528, 222.599] + - [634, 222.599] - - [1024, 1408, 1, 1280] - - [505, 6339.38] + - [611, 6339.38] - - [64, 64, 1, 256] - - [462, 187.346] + - [568, 187.346] - - [704, 256, 1, 3328] - - [504, 4046.14] + - [610, 4046.14] - - [6784, 5056, 1, 256] - - [515, 7972.17] + - [621, 7972.17] - - [1856, 1856, 1, 128] - - [499, 3716.61] + - [605, 3716.61] - - [3584, 5056, 1, 3328] - - [515, 8684.76] + - [621, 8684.76] - - [448, 6784, 1, 128] - - [497, 3829.05] + - [603, 3829.05] - - [4, 704, 1, 3328] - - [530, 393.206] + - [636, 393.206] - - [35, 8457, 1, 4096] - - [419, 3173.24] + - [525, 3173.24] - - [448, 2944, 1, 256] - - [513, 5553.41] + - [619, 5553.41] - - [4, 4288, 1, 3328] - - [472, 573.211] + - [578, 573.211] - - [2944, 6784, 1, 256] - - [509, 8566.06] + - [615, 8566.06] - - [2944, 2944, 1, 128] - - [493, 4540.83] + - [599, 4540.83] - - [4, 4, 1, 1280] - - [472, 3.14762] + - [578, 3.14762] - - [1856, 3584, 1, 1280] - - [509, 7306.36] + - [615, 7306.36] - - [64, 2944, 1, 256] - - [521, 2292.61] + - [627, 2292.61] - - [448, 256, 1, 128] - - [430, 797.93] + - [536, 797.93] - - [4288, 448, 1, 128] - - [496, 3430.5] + - [602, 3430.5] - - [4608, 24000, 1, 1536] - - [514, 6820.24] + - [620, 6820.24] - - [1856, 1408, 1, 3328] - - [517, 6600.24] + - [623, 6600.24] - - [128, 128, 1, 128] - - [422, 161.917] + - [528, 161.917] - - [1024, 4288, 1, 3328] - - [505, 7937.08] + - [611, 7937.08] - - [448, 2368, 1, 256] - - [513, 4526.45] + - [619, 4526.45] - - [1024, 4, 1, 128] - - [525, 16.9907] + - [631, 16.9907] - - [64, 1408, 1, 1280] - - [445, 3345.32] + - [551, 3345.32] - - [64, 6784, 1, 1280] - - [510, 5526.6] + - [616, 5526.6] - - [5056, 448, 1, 256] - - [504, 4216.65] + - [610, 4216.65] - - [2944, 2368, 1, 3328] - - [515, 7000.42] + - [621, 7000.42] - - [704, 4288, 1, 3328] - - [521, 6414.43] + - [627, 6414.43] - - [1408, 128, 1, 256] - - [504, 2720.46] + - [610, 2720.46] - - [1024, 1856, 1, 1280] - - [515, 7682.93] + - [621, 7682.93] - - [2048, 6400, 1, 2048] - - [511, 7418.22] + - [617, 7418.22] - - [512, 48000, 1, 2816] - - [515, 8884.77] + - [621, 8884.77] - - [5124, 9124, 1, 2560] - - [507, 6040.8] + - [613, 6040.8] - - [128, 2368, 1, 3328] - - [461, 5025.66] + - [567, 5025.66] - - [1024, 5888, 1, 256] - - [509, 7322.21] + - [615, 7322.21] - - [64, 2944, 1, 1280] - - [445, 4222.31] + - [551, 4222.31] - - [5056, 64, 1, 3328] - - [486, 4936.32] + - [592, 4936.32] - - [128, 704, 1, 128] - - [431, 683.414] + - [537, 683.414] - - [1408, 2368, 1, 256] - - [510, 6404.22] + - [616, 6404.22] - - [1408, 1408, 1, 256] - - [515, 4537.93] + - [621, 4537.93] - - [4, 64, 1, 128] - - [524, 2.56747] + - [630, 2.56747] - - [64, 1024, 1, 128] - - [423, 532.372] + - [529, 532.372] - - [1024, 8, 1, 500000] - - [413, 1685.08] + - [519, 1685.08] - - [2368, 2368, 1, 128] - - [494, 4334.33] + - [600, 4334.33] - - [64, 5888, 1, 128] - - [423, 2003.19] + - [529, 2003.19] - - [5888, 4, 1, 3328] - - [441, 339.118] + - [547, 339.118] - - [6784, 1408, 1, 128] - - [497, 4431.23] + - [603, 4431.23] - - [4288, 5888, 1, 256] - - [515, 7800.88] + - [621, 7800.88] - - [1408, 5056, 1, 256] - - [509, 8153.38] + - [615, 8153.38] - - [5056, 128, 1, 3328] - - [466, 5829.93] + - [572, 5829.93] - - [128, 128, 1, 1280] - - [469, 1691.35] + - [575, 1691.35] - - [448, 704, 1, 256] - - [510, 3364.28] + - [616, 3364.28] - - [4288, 3584, 1, 128] - - [494, 2952.68] + - [600, 2952.68] - - [2944, 128, 1, 3328] - - [466, 5620.82] + - [572, 5620.82] - - [64, 1408, 1, 3328] - - [467, 4169.91] + - [573, 4169.91] - - [3584, 5056, 1, 1280] - - [512, 7780.76] + - [618, 7780.76] - - [256, 448, 1, 1280] - - [445, 3929.45] + - [551, 3929.45] - - [704, 704, 1, 128] - - [493, 2346.17] + - [599, 2346.17] - - [5056, 4, 1, 128] - - [524, 144.557] + - [630, 144.557] - - [704, 256, 1, 1280] - - [513, 2283.22] + - [619, 2283.22] - - [64, 2368, 1, 3328] - - [445, 4921.69] + - [551, 4921.69] - - [1856, 1024, 1, 128] - - [494, 3459.57] + - [600, 3459.57] - - [1856, 64, 1, 128] - - [426, 918.237] + - [532, 918.237] - - [4096, 64, 1, 4096] - - [471, 4000.62] + - [577, 4000.62] - - [1024, 24000, 1, 1536] - - [507, 8502.36] + - [613, 8502.36] - - [704, 4288, 1, 256] - - [511, 6003.83] + - [617, 6003.83] - - [5888, 2368, 1, 1280] - - [502, 8801.3] + - [608, 8801.3] - - [128, 256, 1, 256] - - [456, 1070.08] + - [562, 1070.08] - - [64, 128, 1, 256] - - [462, 374.591] + - [568, 374.591] - - [2368, 5888, 1, 1280] - - [505, 8308.63] + - [611, 8308.63] - - [5888, 256, 1, 1280] - - [513, 7154.42] + - [619, 7154.42] - - [1760, 128, 1, 1760] - - [454, 5363.91] + - [560, 5363.91] - - [4, 5888, 1, 1280] - - [462, 542.304] + - [568, 542.304] - - [704, 128, 1, 128] - - [434, 779.447] + - [540, 779.447] - - [1024, 4, 1, 1280] - - [462, 392.531] + - [568, 392.531] - - [2368, 1856, 1, 3328] - - [505, 7975.32] + - [611, 7975.32] - - [2368, 128, 1, 128] - - [427, 1584.96] + - [533, 1584.96] - - [2944, 704, 1, 256] - - [513, 4039.21] + - [619, 4039.21] - - [5056, 128, 1, 128] - - [493, 2575.89] + - [599, 2575.89] - - [2368, 1024, 1, 3328] - - [521, 6165.54] + - [627, 6165.54] - - [256, 704, 1, 3328] - - [504, 4028.74] + - [610, 4028.74] - - [704, 3584, 1, 256] - - [515, 6102.92] + - [621, 6102.92] - - [704, 2944, 1, 3328] - - [505, 8202.84] + - [611, 8202.84] - - [6784, 1024, 1, 128] - - [497, 4386.4] + - [603, 4386.4] - - [256, 448, 1, 128] - - [434, 834.195] + - [540, 834.195] - - [448, 1024, 1, 3328] - - [522, 5412.48] + - [628, 5412.48] - - [2944, 1024, 1, 3328] - - [515, 6265.87] + - [621, 6265.87] - - [2944, 5056, 1, 128] - - [493, 4770.88] + - [599, 4770.88] - - [2368, 256, 1, 256] - - [510, 3975.23] + - [616, 3975.23] - - [1408, 6784, 1, 256] - - [509, 7987.02] + - [615, 7987.02] - - [6784, 1408, 1, 3328] - - [509, 8472.71] + - [615, 8472.71] - - [4288, 6784, 1, 128] - - [500, 3865.2] + - [606, 3865.2] - - [704, 64, 1, 256] - - [448, 1287.41] + - [554, 1287.41] - - [5888, 4, 1, 1280] - - [447, 510.022] + - [553, 510.022] - - [256, 2368, 1, 3328] - - [510, 5837.65] + - [616, 5837.65] - - [6784, 2944, 1, 1280] - - [515, 8560.54] + - [621, 8560.54] - - [4288, 1856, 1, 128] - - [493, 4617.07] + - [599, 4617.07] - - [1856, 2944, 1, 128] - - [493, 4287.73] + - [599, 4287.73] - - [6784, 448, 1, 128] - - [497, 3893.43] + - [603, 3893.43] - - [64, 3584, 1, 128] - - [423, 1609.76] + - [529, 1609.76] - - [448, 5056, 1, 1280] - - [513, 7124.41] + - [619, 7124.41] - - [2368, 1856, 1, 128] - - [496, 4004.65] + - [602, 4004.65] - - [64, 2944, 1, 3328] - - [446, 5086.48] + - [552, 5086.48] - - [4288, 704, 1, 256] - - [511, 6176.57] + - [617, 6176.57] - - [256, 3584, 1, 128] - - [494, 2553.15] + - [600, 2553.15] - - [5888, 704, 1, 256] - - [510, 6781.51] + - [616, 6781.51] - - [3584, 1024, 1, 128] - - [497, 3660.95] + - [603, 3660.95] - - [256, 5888, 1, 3328] - - [513, 7772.13] + - [619, 7772.13] - - [1408, 4288, 1, 3328] - - [509, 8832.86] + - [615, 8832.86] - - [6784, 4288, 1, 256] - - [515, 8566.14] + - [621, 8566.14] - - [4288, 256, 1, 128] - - [495, 1953.79] + - [601, 1953.79] - - [5888, 256, 1, 256] - - [513, 3730.53] + - [619, 3730.53] - - [6784, 1024, 1, 1280] - - [509, 8578.39] + - [615, 8578.39] - - [5888, 1024, 1, 128] - - [494, 4092.96] + - [600, 4092.96] - - [1024, 128, 1, 256] - - [444, 1897.98] + - [550, 1897.98] - - [512, 16, 1, 500000] - - [415, 2363.79] + - [521, 2363.79] - - [128, 64, 1, 3328] - - [472, 1592.56] + - [578, 1592.56] - - [448, 64, 1, 256] - - [462, 976.168] + - [568, 976.168] - - [2368, 256, 1, 128] - - [497, 2094.99] + - [603, 2094.99] - - [6784, 3584, 1, 1280] - - [509, 8570.16] + - [615, 8570.16] - - [1024, 6784, 1, 1280] - - [515, 8203.57] + - [621, 8203.57] - - [2944, 64, 1, 1280] - - [453, 4300.61] + - [559, 4300.61] - - [1408, 2944, 1, 1280] - - [505, 7349.64] + - [611, 7349.64] - - [256, 1856, 1, 256] - - [504, 4649.75] + - [610, 4649.75] - - [2048, 800, 1, 2048] - - [523, 4668.73] + - [629, 4668.73] - - [1408, 2368, 1, 3328] - - [513, 7537.74] + - [619, 7537.74] - - [2944, 4, 1, 3328] - - [462, 514.142] + - [568, 514.142] - - [128, 1408, 1, 3328] - - [454, 4991.64] + - [560, 4991.64] - - [2944, 1856, 1, 128] - - [493, 4317.39] + - [599, 4317.39] - - [256, 2944, 1, 128] - - [493, 2258.27] + - [599, 2258.27] - - [256, 6784, 1, 128] - - [493, 3147.02] + - [599, 3147.02] - - [2368, 4, 1, 128] - - [525, 33.9286] + - [631, 33.9286] - - [1408, 256, 1, 3328] - - [504, 5077.85] + - [610, 5077.85] - - [1856, 4, 1, 128] - - [525, 21.5025] + - [631, 21.5025] - - [5056, 6784, 1, 128] - - [493, 4945.11] + - [599, 4945.11] - - [4288, 5056, 1, 128] - - [496, 4729.87] + - [602, 4729.87] - - [1856, 5888, 1, 128] - - [493, 4707.96] + - [599, 4707.96] - - [2944, 5888, 1, 256] - - [507, 8014.78] + - [613, 8014.78] - - [3584, 1856, 1, 256] - - [509, 7567.13] + - [615, 7567.13] - - [4288, 3584, 1, 1280] - - [502, 8726.43] + - [608, 8726.43] - - [2368, 448, 1, 256] - - [510, 4227.7] + - [616, 4227.7] - - [4288, 256, 1, 3328] - - [511, 5487.41] + - [617, 5487.41] - - [1856, 704, 1, 128] - - [497, 3125.06] + - [603, 3125.06] - - [1408, 64, 1, 256] - - [457, 1620.09] + - [563, 1620.09] - - [64, 1856, 1, 128] - - [421, 955.147] + - [527, 955.147] - - [4, 256, 1, 128] - - [524, 10.8789] + - [630, 10.8789] - - [2560, 16, 1, 2560] - - [469, 2019.7] + - [575, 2019.7] - - [704, 5888, 1, 128] - - [498, 3976.26] + - [604, 3976.26] - - [6784, 3584, 1, 128] - - [497, 4018.91] + - [603, 4018.91] - - [1024, 64, 1, 256] - - [462, 1370.79] + - [568, 1370.79] - - [64, 2368, 1, 256] - - [504, 2255.76] + - [610, 2255.76] - - [4288, 5056, 1, 3328] - - [509, 8368.69] + - [615, 8368.69] - - [4, 1856, 1, 1280] - - [462, 392.126] + - [568, 392.126] - - [4288, 128, 1, 128] - - [427, 2287.03] + - [533, 2287.03] - - [1408, 1408, 1, 128] - - [497, 3233.48] + - [603, 3233.48] - - [7680, 16, 1, 2560] - - [465, 2257.37] + - [571, 2257.37] - - [1856, 128, 1, 128] - - [427, 1532.8] + - [533, 1532.8] - - [5056, 2368, 1, 256] - - [509, 8167.29] + - [615, 8167.29] - - [4288, 704, 1, 3328] - - [515, 6411.16] + - [621, 6411.16] - - [448, 3584, 1, 256] - - [515, 5477.74] + - [621, 5477.74] - - [2368, 64, 1, 1280] - - [445, 3936.52] + - [551, 3936.52] - - [2368, 1024, 1, 1280] - - [511, 7688.82] + - [617, 7688.82] - - [2944, 1408, 1, 3328] - - [502, 7668.78] + - [608, 7668.78] - - [1408, 448, 1, 256] - - [504, 4863.98] + - [610, 4863.98] - - [1024, 1408, 1, 3328] - - [513, 7448.99] + - [619, 7448.99] - - [2944, 5888, 1, 1280] - - [503, 8208.57] + - [609, 8208.57] - - [1408, 4, 1, 1280] - - [442, 479.419] + - [548, 479.419] - - [5888, 3584, 1, 256] - - [503, 8610.09] + - [609, 8610.09] - - [2368, 5056, 1, 128] - - [500, 3726.25] + - [606, 3726.25] - - [1408, 1856, 1, 3328] - - [504, 7829.48] + - [610, 7829.48] - - [4, 4, 1, 3328] - - [531, 4.39419] + - [637, 4.39419] - - [6784, 1408, 1, 1280] - - [504, 7690.8] + - [610, 7690.8] - - [4096, 7000, 1, 4096] - - [516, 6272.49] + - [622, 6272.49] - - [704, 2944, 1, 256] - - [505, 6095.91] + - [611, 6095.91] - - [4288, 64, 1, 256] - - [470, 2121.31] + - [576, 2121.31] - - [6784, 5888, 1, 3328] - - [509, 8955.6] + - [615, 8955.6] - - [2368, 4288, 1, 128] - - [493, 4699.65] + - [599, 4699.65] - - [64, 4288, 1, 1280] - - [483, 4013.73] + - [589, 4013.73] - - [6784, 64, 1, 1280] - - [504, 5418.83] + - [610, 5418.83] - - [3584, 128, 1, 128] - - [433, 2165.3] + - [539, 2165.3] - - [1024, 6784, 1, 128] - - [494, 3765.3] + - [600, 3765.3] - - [4, 1856, 1, 128] - - [525, 33.3728] + - [631, 33.3728] - - [1408, 64, 1, 3328] - - [466, 4489.51] + - [572, 4489.51] - - [6784, 4, 1, 256] - - [462, 400.262] + - [568, 400.262] - - [1408, 1408, 1, 1280] - - [509, 8139.53] + - [615, 8139.53] - - [16384, 400, 1, 4096] - - [513, 6087.28] + - [619, 6087.28] - - [256, 2368, 1, 256] - - [504, 4766.35] + - [610, 4766.35] - - [448, 4288, 1, 3328] - - [511, 7577.08] + - [617, 7577.08] - - [2368, 1408, 1, 256] - - [507, 5284.53] + - [613, 5284.53] - - [5888, 5056, 1, 128] - - [494, 3643.6] + - [600, 3643.6] - - [704, 2368, 1, 256] - - [509, 5334.73] + - [615, 5334.73] - - [1024, 24000, 1, 2560] - - [517, 7438.06] + - [623, 7438.06] - - [2944, 448, 1, 1280] - - [518, 4937.53] + - [624, 4937.53] - - [5888, 2368, 1, 3328] - - [503, 8201.84] + - [609, 8201.84] - - [5124, 9124, 1, 1760] - - [510, 6764.06] + - [616, 6764.06] - - [448, 1408, 1, 1280] - - [504, 5881.54] + - [610, 5881.54] - - [448, 1856, 1, 1280] - - [511, 6225.56] + - [617, 6225.56] - - [4288, 448, 1, 1280] - - [513, 5626.37] + - [619, 5626.37] - - [5888, 704, 1, 3328] - - [507, 7873.62] + - [613, 7873.62] - - [5056, 256, 1, 128] - - [498, 2921.03] + - [604, 2921.03] - - [1856, 256, 1, 128] - - [500, 1995.42] + - [606, 1995.42] - - [64, 1408, 1, 128] - - [421, 758.938] + - [527, 758.938] - - [704, 4, 1, 256] - - [462, 130.697] + - [568, 130.697] - - [1408, 5888, 1, 128] - - [493, 4574.05] + - [599, 4574.05] - - [7680, 12000, 1, 2560] - - [509, 8747.13] + - [615, 8747.13] - - [1408, 1024, 1, 256] - - [506, 4609.23] + - [612, 4609.23] - - [8192, 400, 1, 2048] - - [518, 5283.25] + - [624, 5283.25] - - [1024, 1856, 1, 128] - - [493, 2686.38] + - [599, 2686.38] - - [256, 704, 1, 128] - - [493, 1004.83] + - [599, 1004.83] - - [2560, 128, 1, 2560] - - [471, 4259.14] + - [577, 4259.14] - - [448, 1024, 1, 256] - - [504, 4813.24] + - [610, 4813.24] - - [128, 4, 1, 3328] - - [530, 128.408] + - [636, 128.408] - - [5056, 6784, 1, 1280] - - [512, 6579.85] + - [618, 6579.85] - - [1408, 64, 1, 128] - - [434, 819.3] + - [540, 819.3] - - [1024, 448, 1, 1280] - - [513, 5703.31] + - [619, 5703.31] - - [704, 5056, 1, 3328] - - [505, 7574.49] + - [611, 7574.49] - - [128, 5056, 1, 256] - - [504, 5113.53] + - [610, 5113.53] - - [64, 1024, 1, 3328] - - [489, 3980.1] + - [595, 3980.1] - - [1856, 4, 1, 3328] - - [443, 433.253] + - [549, 433.253] - - [4, 2944, 1, 128] - - [525, 46.6225] + - [631, 46.6225] - - [2368, 2944, 1, 3328] - - [503, 9002.13] + - [609, 9002.13] - - [448, 448, 1, 1280] - - [445, 3969.52] + - [551, 3969.52] - - [2368, 3584, 1, 256] - - [515, 7806.39] + - [621, 7806.39] - - [5056, 3584, 1, 1280] - - [502, 8971.56] + - [608, 8971.56] - - [5124, 9124, 1, 4096] - - [515, 7208.72] + - [621, 7208.72] - - [7680, 48000, 1, 2560] - - [509, 3835.91] + - [615, 3835.91] - - [448, 4, 1, 3328] - - [530, 409.7] + - [636, 409.7] - - [1856, 2944, 1, 1280] - - [502, 7173.71] + - [608, 7173.71] - - [1024, 48000, 1, 2816] - - [509, 8976.26] + - [615, 8976.26] - - [128, 1024, 1, 256] - - [448, 1969.26] + - [554, 1969.26] - - [2944, 1408, 1, 256] - - [511, 4585.12] + - [617, 4585.12] - - [4288, 1408, 1, 3328] - - [505, 8237.27] + - [611, 8237.27] - - [3584, 64, 1, 3328] - - [451, 5183.16] + - [557, 5183.16] - - [5888, 2944, 1, 128] - - [500, 3674.56] + - [606, 3674.56] - - [2944, 1024, 1, 128] - - [497, 3834.32] + - [603, 3834.32] - - [4288, 5056, 1, 1280] - - [509, 8086.1] + - [615, 8086.1] - - [5888, 6784, 1, 1280] - - [503, 6941.32] + - [609, 6941.32] - - [6784, 5056, 1, 128] - - [494, 4860.15] + - [600, 4860.15] - - [256, 1024, 1, 3328] - - [518, 5156.22] + - [624, 5156.22] - - [3584, 4, 1, 256] - - [462, 332.529] + - [568, 332.529] - - [1760, 1600, 1, 1760] - - [505, 6330.76] + - [611, 6330.76] - - [1856, 64, 1, 3328] - - [466, 4756.03] + - [572, 4756.03] - - [4, 128, 1, 3328] - - [530, 160.244] + - [636, 160.244] - - [5888, 1408, 1, 3328] - - [503, 8722.74] + - [609, 8722.74] - - [448, 2944, 1, 128] - - [496, 2997.63] + - [602, 2997.63] - - [2368, 1856, 1, 256] - - [504, 6662.34] + - [610, 6662.34] - - [256, 5056, 1, 256] - - [506, 5256.29] + - [612, 5256.29] - - [128, 3584, 1, 128] - - [425, 2073.56] + - [531, 2073.56] - - [448, 3584, 1, 3328] - - [502, 6833.96] + - [608, 6833.96] - - [4, 5056, 1, 3328] - - [472, 581.523] + - [578, 581.523] - - [704, 2368, 1, 128] - - [493, 3402.29] + - [599, 3402.29] - - [5888, 256, 1, 128] - - [498, 2977.54] + - [604, 2977.54] - - [4, 5056, 1, 128] - - [524, 65.2074] + - [630, 65.2074] - - [448, 256, 1, 256] - - [510, 1764.53] + - [616, 1764.53] - - [704, 4, 1, 3328] - - [462, 398.554] + - [568, 398.554] - - [1408, 256, 1, 256] - - [505, 3463.86] + - [611, 3463.86] - - [3584, 1856, 1, 128] - - [501, 3228.19] + - [607, 3228.19] - - [4288, 4288, 1, 128] - - [497, 4853.93] + - [603, 4853.93] - - [1856, 1024, 1, 3328] - - [521, 5994.68] + - [627, 5994.68] - - [128, 5888, 1, 3328] - - [475, 6512.85] + - [581, 6512.85] - - [1024, 5056, 1, 256] - - [515, 7859.42] + - [621, 7859.42] - - [5888, 5888, 1, 1280] - - [515, 8131.44] + - [621, 8131.44] - - [5056, 5888, 1, 128] - - [494, 4920.71] + - [600, 4920.71] - - [2368, 1408, 1, 3328] - - [513, 7110.74] + - [619, 7110.74] - - [1024, 48000, 1, 1536] - - [513, 8590.82] + - [619, 8590.82] - - [5888, 448, 1, 256] - - [514, 3567.74] + - [620, 3567.74] - - [2560, 3200, 1, 2560] - - [504, 7638.31] + - [610, 7638.31] - - [5888, 6784, 1, 128] - - [494, 3910.92] + - [600, 3910.92] - - [6144, 48000, 1, 2048] - - [515, 3412.95] + - [621, 3412.95] - - [6784, 5056, 1, 1280] - - [506, 7890.22] + - [612, 7890.22] - - [5056, 704, 1, 1280] - - [510, 7665.06] + - [616, 7665.06] - - [1024, 48000, 1, 2560] - - [515, 8188.5] + - [621, 8188.5] - - [4608, 32, 1, 1536] - - [483, 2856.97] + - [589, 2856.97] - - [1024, 2368, 1, 128] - - [493, 3019.35] + - [599, 3019.35] - - [128, 704, 1, 256] - - [444, 1696.33] + - [550, 1696.33] - - [2368, 448, 1, 3328] - - [510, 5799.29] + - [616, 5799.29] - - [128, 5888, 1, 1280] - - [504, 6680.75] + - [610, 6680.75] - - [16384, 800, 1, 4096] - - [509, 6322.22] + - [615, 6322.22] - - [448, 128, 1, 1280] - - [483, 2849.49] + - [589, 2849.49] - - [6784, 4, 1, 3328] - - [462, 563.12] + - [568, 563.12] - - [5888, 5056, 1, 1280] - - [509, 8631.33] + - [615, 8631.33] - - [1024, 64, 1, 3328] - - [484, 3481.96] + - [590, 3481.96] - - [3072, 48000, 1, 1024] - - [509, 9019.49] + - [615, 9019.49] - - [64, 3584, 1, 1280] - - [446, 4327.95] + - [552, 4327.95] - - [6784, 1408, 1, 256] - - [509, 6320.59] + - [615, 6320.59] - - [3584, 5888, 1, 128] - - [496, 4406.79] + - [602, 4406.79] - - [5056, 5888, 1, 256] - - [515, 8037.13] + - [621, 8037.13] - - [2368, 1024, 1, 256] - - [507, 4936.14] + - [613, 4936.14] - - [2944, 1856, 1, 256] - - [515, 7222.32] + - [621, 7222.32] - - [1856, 6784, 1, 1280] - - [505, 8251.81] + - [611, 8251.81] - - [64, 5056, 1, 128] - - [425, 1643.7] + - [531, 1643.7] - - [64, 6784, 1, 128] - - [423, 1929.77] + - [529, 1929.77] - - [448, 704, 1, 128] - - [495, 979.959] + - [601, 979.959] - - [4, 1024, 1, 128] - - [524, 20.1416] + - [630, 20.1416] - - [4288, 3584, 1, 256] - - [509, 8444.14] + - [615, 8444.14] - - [1408, 704, 1, 128] - - [493, 3021.0] + - [599, 3021.0] - - [64, 256, 1, 3328] - - [489, 2227.47] + - [595, 2227.47] - - [6784, 448, 1, 3328] - - [515, 6573.11] + - [621, 6573.11] - - [5056, 1856, 1, 1280] - - [507, 7976.23] + - [613, 7976.23] - - [1408, 1024, 1, 3328] - - [505, 7470.33] + - [611, 7470.33] - - [2368, 256, 1, 3328] - - [510, 5394.37] + - [616, 5394.37] - - [5888, 3584, 1, 1280] - - [502, 9031.55] + - [608, 9031.55] - - [1856, 3584, 1, 3328] - - [517, 7272.6] + - [623, 7272.6] - - [5888, 128, 1, 1280] - - [510, 6684.48] + - [616, 6684.48] - - [1024, 2944, 1, 256] - - [515, 7415.09] + - [621, 7415.09] - - [448, 6784, 1, 1280] - - [511, 7923.78] + - [617, 7923.78] - - [256, 3584, 1, 1280] - - [507, 6901.87] + - [613, 6901.87] - - [704, 5056, 1, 256] - - [512, 5004.55] + - [618, 5004.55] - - [3584, 1024, 1, 3328] - - [504, 7894.63] + - [610, 7894.63] - - [2944, 1856, 1, 1280] - - [509, 7903.27] + - [615, 7903.27] - - [128, 256, 1, 128] - - [422, 325.745] + - [528, 325.745] - - [5056, 256, 1, 256] - - [506, 3356.56] + - [612, 3356.56] - - [2944, 4288, 1, 3328] - - [515, 7813.93] + - [621, 7813.93] - - [2368, 3584, 1, 3328] - - [515, 8371.09] + - [621, 8371.09] - - [2944, 704, 1, 1280] - - [521, 5514.09] + - [627, 5514.09] - - [128, 4, 1, 256] - - [462, 25.3062] + - [568, 25.3062] - - [2944, 3584, 1, 1280] - - [509, 7738.83] + - [615, 7738.83] - - [1856, 5888, 1, 1280] - - [503, 8584.63] + - [609, 8584.63] - - [256, 256, 1, 1280] - - [483, 2962.18] + - [589, 2962.18] - - [2048, 3200, 1, 2048] - - [511, 6911.69] + - [617, 6911.69] - - [4288, 1408, 1, 256] - - [509, 7954.0] + - [615, 7954.0] - - [3584, 64, 1, 256] - - [510, 2780.42] + - [616, 2780.42] - - [64, 1856, 1, 3328] - - [445, 4912.04] + - [551, 4912.04] - - [256, 1408, 1, 128] - - [493, 1373.24] + - [599, 1373.24] - - [5888, 1408, 1, 128] - - [498, 4242.01] + - [604, 4242.01] - - [4288, 2368, 1, 1280] - - [507, 8012.7] + - [613, 8012.7] - - [4, 4288, 1, 256] - - [528, 301.674] + - [634, 301.674] - - [256, 4288, 1, 128] - - [493, 2706.36] + - [599, 2706.36] - - [2048, 128, 1, 2048] - - [488, 2885.26] + - [594, 2885.26] - - [256, 128, 1, 3328] - - [490, 3170.21] + - [596, 3170.21] - - [512, 8, 1, 500000] - - [414, 1915.12] + - [520, 1915.12] - - [6784, 2368, 1, 256] - - [509, 8323.66] + - [615, 8323.66] - - [5888, 128, 1, 128] - - [497, 2466.08] + - [603, 2466.08] - - [1024, 24000, 1, 2816] - - [507, 8131.64] + - [613, 8131.64] - - [7680, 5984, 1, 2560] - - [511, 6040.77] + - [617, 6040.77] - - [4288, 1856, 1, 256] - - [523, 5818.53] + - [629, 5818.53] - - [1856, 256, 1, 3328] - - [504, 6532.03] + - [610, 6532.03] - - [1856, 2944, 1, 256] - - [509, 7312.92] + - [615, 7312.92] - - [5056, 1024, 1, 128] - - [499, 4103.0] + - [605, 4103.0] - - [64, 5888, 1, 1280] - - [504, 5058.25] + - [610, 5058.25] - - [1760, 800, 1, 1760] - - [507, 7280.0] + - [613, 7280.0] - - [6784, 256, 1, 128] - - [497, 3257.69] + - [603, 3257.69] - - [5888, 704, 1, 128] - - [493, 3813.93] + - [599, 3813.93] - - [1408, 2368, 1, 128] - - [494, 3561.27] + - [600, 3561.27] - - [1024, 4288, 1, 1280] - - [513, 7752.74] + - [619, 7752.74] - - [2368, 5056, 1, 3328] - - [516, 7711.91] + - [622, 7711.91] - - [448, 4, 1, 128] - - [524, 18.4795] + - [630, 18.4795] - - [4, 256, 1, 3328] - - [531, 269.71] + - [637, 269.71] - - [4288, 1024, 1, 3328] - - [510, 7910.27] + - [616, 7910.27] - - [6144, 48000, 1, 2560] - - [509, 3541.09] + - [615, 3541.09] - - [1024, 5056, 1, 3328] - - [503, 8509.66] + - [609, 8509.66] - - [1024, 1856, 1, 3328] - - [509, 7907.93] + - [615, 7907.93] - - [704, 704, 1, 1280] - - [521, 5648.15] + - [627, 5648.15] - - [128, 2368, 1, 1280] - - [480, 4145.11] + - [586, 4145.11] - - [1408, 128, 1, 3328] - - [453, 4919.6] + - [559, 4919.6] - - [3584, 256, 1, 1280] - - [505, 5185.56] + - [611, 5185.56] - - [4, 128, 1, 128] - - [524, 3.07891] + - [630, 3.07891] - - [5888, 64, 1, 1280] - - [453, 4499.59] + - [559, 4499.59] - - [3584, 128, 1, 1280] - - [510, 5929.01] + - [616, 5929.01] - - [4, 256, 1, 1280] - - [529, 170.767] + - [635, 170.767] - - [128, 704, 1, 3328] - - [453, 4379.37] + - [559, 4379.37] - - [4288, 6784, 1, 256] - - [503, 7181.09] + - [609, 7181.09] - - [3584, 2944, 1, 3328] - - [509, 8553.3] + - [615, 8553.3] - - [128, 1856, 1, 256] - - [510, 3207.77] + - [616, 3207.77] - - [64, 4288, 1, 256] - - [504, 2907.99] + - [610, 2907.99] - - [4, 3584, 1, 3328] - - [462, 560.605] + - [568, 560.605] - - [64, 4, 1, 3328] - - [531, 67.5025] + - [637, 67.5025] - - [4, 64, 1, 3328] - - [531, 88.8467] + - [637, 88.8467] - - [5888, 2944, 1, 256] - - [509, 7255.77] + - [615, 7255.77] - - [1856, 64, 1, 256] - - [455, 1743.72] + - [561, 1743.72] - - [5056, 128, 1, 1280] - - [510, 6009.79] + - [616, 6009.79] - - [448, 4288, 1, 1280] - - [511, 6466.82] + - [617, 6466.82] - - [448, 1856, 1, 3328] - - [511, 6381.99] + - [617, 6381.99] - - [1024, 4288, 1, 128] - - [496, 3491.87] + - [602, 3491.87] - - [4, 1024, 1, 256] - - [529, 172.563] + - [635, 172.563] - - [5056, 4288, 1, 256] - - [509, 8241.52] + - [615, 8241.52] - - [1024, 448, 1, 256] - - [513, 4218.51] + - [619, 4218.51] - - [1024, 3584, 1, 256] - - [509, 6513.69] + - [615, 6513.69] - - [2944, 128, 1, 1280] - - [453, 4710.48] + - [559, 4710.48] - - [2048, 32, 1, 2048] - - [468, 1779.23] + - [574, 1779.23] - - [64, 256, 1, 256] - - [462, 655.46] + - [568, 655.46] - - [1408, 4, 1, 128] - - [525, 20.1249] + - [631, 20.1249] - - [128, 2368, 1, 128] - - [425, 1707.73] + - [531, 1707.73] - - [256, 704, 1, 1280] - - [504, 3735.31] + - [610, 3735.31] - - [64, 2368, 1, 128] - - [432, 1049.81] + - [538, 1049.81] - - [6784, 6784, 1, 3328] - - [509, 9277.94] + - [615, 9277.94] - - [448, 5888, 1, 1280] - - [515, 7319.75] + - [621, 7319.75] - - [5056, 448, 1, 128] - - [497, 3694.43] + - [603, 3694.43] - - [4288, 704, 1, 1280] - - [507, 7890.96] + - [613, 7890.96] - - [3584, 2944, 1, 128] - - [499, 4124.71] + - [605, 4124.71] - - [6784, 256, 1, 1280] - - [515, 7185.83] + - [621, 7185.83] - - [256, 2944, 1, 1280] - - [504, 6736.76] + - [610, 6736.76] - - [64, 4288, 1, 128] - - [423, 1614.41] + - [529, 1614.41] - - [2368, 5888, 1, 3328] - - [505, 8616.46] + - [611, 8616.46] - - [4, 64, 1, 256] - - [442, 11.4778] + - [548, 11.4778] - - [704, 1024, 1, 3328] - - [510, 6801.92] + - [616, 6801.92] - - [2368, 1856, 1, 1280] - - [507, 7853.57] + - [613, 7853.57] - - [448, 5056, 1, 3328] - - [510, 7453.04] + - [616, 7453.04] - - [128, 448, 1, 128] - - [425, 530.449] + - [531, 530.449] - - [128, 6784, 1, 256] - - [505, 5557.55] + - [611, 5557.55] - - [3584, 4288, 1, 128] - - [496, 4462.73] + - [602, 4462.73] - - [64, 448, 1, 128] - - [425, 278.132] + - [531, 278.132] - - [5888, 4288, 1, 3328] - - [502, 9153.55] + - [608, 9153.55] - - [2368, 704, 1, 256] - - [509, 5350.78] + - [615, 5350.78] - - [256, 1856, 1, 3328] - - [504, 6536.35] + - [610, 6536.35] - - [1856, 128, 1, 256] - - [518, 2847.36] + - [624, 2847.36] - - [6784, 128, 1, 128] - - [498, 2530.82] + - [604, 2530.82] - - [3584, 1408, 1, 128] - - [499, 3625.62] + - [605, 3625.62] - - [1856, 5056, 1, 1280] - - [505, 8123.39] + - [611, 8123.39] - - [2944, 1024, 1, 1280] - - [515, 8450.41] + - [621, 8450.41] - - [5056, 4, 1, 256] - - [529, 380.787] + - [635, 380.787] - - [3584, 5888, 1, 3328] - - [507, 8567.99] + - [613, 8567.99] - - [2368, 4288, 1, 256] - - [511, 7858.07] + - [617, 7858.07] - - [1024, 2368, 1, 3328] - - [505, 6776.45] + - [611, 6776.45] - - [64, 704, 1, 3328] - - [460, 3503.52] + - [566, 3503.52] - - [704, 1408, 1, 256] - - [505, 6099.99] + - [611, 6099.99] - - [4096, 128, 1, 4096] - - [485, 4116.57] + - [591, 4116.57] - - [1024, 3584, 1, 1280] - - [515, 7231.65] + - [621, 7231.65] - - [4288, 5888, 1, 3328] - - [509, 8762.42] + - [615, 8762.42] - - [4288, 4, 1, 1280] - - [462, 492.797] + - [568, 492.797] - - [4608, 16, 1, 1536] - - [463, 1892.58] + - [569, 1892.58] - - [5888, 64, 1, 128] - - [440, 1747.73] + - [546, 1747.73] - - [4, 5888, 1, 128] - - [525, 84.5915] + - [631, 84.5915] - - [1024, 2944, 1, 3328] - - [513, 6907.05] + - [619, 6907.05] - - [6784, 1856, 1, 256] - - [509, 6274.07] + - [615, 6274.07] - - [2048, 64, 1, 2048] - - [492, 2371.44] + - [598, 2371.44] - - [256, 6784, 1, 1280] - - [509, 7067.04] + - [615, 7067.04] - - [1856, 3584, 1, 256] - - [515, 7706.87] + - [621, 7706.87] - - [128, 448, 1, 3328] - - [460, 3995.93] + - [566, 3995.93] - - [6784, 1856, 1, 128] - - [497, 4459.09] + - [603, 4459.09] - - [4, 448, 1, 256] - - [462, 84.4294] + - [568, 84.4294] - - [5056, 128, 1, 256] - - [510, 4954.5] + - [616, 4954.5] - - [512, 24000, 1, 2816] - - [503, 8994.98] + - [609, 8994.98] - - [256, 5888, 1, 1280] - - [502, 6184.0] + - [608, 6184.0] - - [4, 128, 1, 1280] - - [530, 71.9597] + - [636, 71.9597] - - [16384, 1600, 1, 4096] - - [509, 6921.09] + - [615, 6921.09] - - [6784, 128, 1, 1280] - - [513, 6486.37] + - [619, 6486.37] - - [64, 1408, 1, 256] - - [450, 1647.86] + - [556, 1647.86] - - [2368, 1408, 1, 128] - - [497, 3937.1] + - [603, 3937.1] - - [1856, 448, 1, 256] - - [510, 4635.57] + - [616, 4635.57] - - [1408, 1024, 1, 128] - - [493, 3208.51] + - [599, 3208.51] - - [128, 64, 1, 128] - - [422, 70.192] + - [528, 70.192] - - [6784, 3584, 1, 3328] - - [515, 8466.28] + - [621, 8466.28] - - [1760, 7000, 1, 1760] - - [513, 8149.21] + - [619, 8149.21] - - [2944, 64, 1, 3328] - - [446, 5018.09] + - [552, 5018.09] - - [64, 64, 1, 128] - - [422, 35.5249] + - [528, 35.5249] - - [2368, 5056, 1, 1280] - - [509, 8764.0] + - [615, 8764.0] - - [64, 4, 1, 1280] - - [531, 43.6745] + - [637, 43.6745] - - [1408, 2368, 1, 1280] - - [510, 7660.38] + - [616, 7660.38] - - [128, 1408, 1, 1280] - - [445, 4185.27] + - [551, 4185.27] - - [256, 64, 1, 3328] - - [470, 2071.75] + - [576, 2071.75] - - [704, 4288, 1, 128] - - [493, 4069.18] + - [599, 4069.18] - - [128, 1856, 1, 3328] - - [476, 5776.15] + - [582, 5776.15] - - [2944, 2944, 1, 256] - - [515, 7949.31] + - [621, 7949.31] - - [2944, 4, 1, 1280] - - [462, 483.218] + - [568, 483.218] - - [5888, 4, 1, 256] - - [447, 396.765] + - [553, 396.765] - - [6784, 256, 1, 256] - - [521, 4044.83] + - [627, 4044.83] - - [256, 5056, 1, 3328] - - [504, 7607.37] + - [610, 7607.37] - - [128, 4288, 1, 1280] - - [445, 4958.78] + - [551, 4958.78] - - [5056, 1856, 1, 128] - - [497, 4560.94] + - [603, 4560.94] - - [5056, 1024, 1, 3328] - - [509, 8634.18] + - [615, 8634.18] - - [128, 128, 1, 256] - - [447, 699.151] + - [553, 699.151] - - [1760, 64, 1, 1760] - - [453, 4580.65] + - [559, 4580.65] - - [4288, 3584, 1, 3328] - - [515, 9143.76] + - [621, 9143.76] - - [448, 704, 1, 3328] - - [504, 4473.43] + - [610, 4473.43] - - [448, 448, 1, 128] - - [435, 1264.38] + - [541, 1264.38] - - [1024, 2368, 1, 1280] - - [513, 7452.51] + - [619, 7452.51] - - [1856, 704, 1, 3328] - - [504, 6103.34] + - [610, 6103.34] - - [4, 2368, 1, 128] - - [524, 96.019] + - [630, 96.019] - - [5888, 6784, 1, 3328] - - [509, 9131.74] + - [615, 9131.74] - - [704, 4288, 1, 1280] - - [511, 7906.46] + - [617, 7906.46] - - [704, 256, 1, 256] - - [504, 2772.78] + - [610, 2772.78] - - [1024, 48000, 1, 2048] - - [508, 6513.45] + - [614, 6513.45] - - [4288, 1024, 1, 128] - - [493, 4291.77] - - - [512, 2048, 1, 49] - - [539, 4555.08] - - - [512, 128, 1, 784] - - [532, 3195.39] - - - [2048, 512, 1, 49] - - [540, 4253.43] - - - [1024, 256, 1, 196] - - [536, 4039.43] + - [599, 4291.77] - - [256, 64, 1, 3136] - - [534, 3015.37] + - [640, 3015.37] - - [256, 1024, 1, 196] - - [538, 4225.45] - - - [64, 256, 1, 3136] - - [535, 3058.45] - - - [128, 512, 1, 784] - - [533, 3380.38] - - - [64, 64, 1, 3136] - - [537, 1372.44] + - [644, 4225.45] - - [1024, 1024, 1, 3328] - - [650, 8705.1] + - [756, 8705.1] - - [2048, 200, 1, 3200] - - [655, 6173.42] + - [761, 6173.42] - - [1024, 200, 1, 13312] - - [553, 5213.31] + - [659, 5213.31] - - [1024, 256, 1, 1536] - - [655, 5859.43] + - [761, 5859.43] - - [4096, 256, 1, 12288] - - [660, 8807.52] + - [766, 8807.52] - - [64, 200, 1, 1024] - - [627, 366.632] + - [733, 366.632] - - [32, 512, 1, 1024] - - [582, 453.049] + - [688, 453.049] - - [2048, 256, 1, 3328] - - [644, 7876.73] + - [750, 7876.73] - - [4096, 512, 1, 32] - - [648, 3975.74] + - [754, 3975.74] - - [2048, 256, 1, 13312] - - [625, 7837.81] + - [731, 7837.81] - - [4096, 200, 1, 11264] - - [660, 6902.76] + - [766, 6902.76] - - [2048, 512, 1, 1024] - - [654, 8100.14] + - [760, 8100.14] - - [2048, 1024, 1, 1664] - - [554, 9082.08] + - [660, 9082.08] - - [1024, 1024, 1, 64] - - [650, 4258.28] + - [756, 4258.28] - - [512, 1024, 1, 1536] - - [644, 7597.33] + - [750, 7597.33] - - [1024, 256, 1, 15360] - - [545, 6735.24] + - [651, 6735.24] - - [1, 512, 1, 1024] - - [595, 15.1657] + - [701, 15.1657] - - [4096, 512, 1, 1408] - - [557, 9024.52] + - [663, 9024.52] - - [1024, 200, 1, 1408] - - [655, 4461.09] + - [761, 4461.09] - - [1024, 512, 1, 512] - - [649, 6528.2] + - [755, 6528.2] - - [4096, 256, 1, 15360] - - [656, 8824.03] + - [762, 8824.03] - - [2048, 512, 1, 640] - - [646, 7989.25] + - [752, 7989.25] - - [4096, 1024, 1, 1280] - - [552, 9421.54] + - [658, 9421.54] - - [1024, 200, 1, 6144] - - [644, 4966.52] + - [750, 4966.52] - - [1024, 1024, 1, 512] - - [646, 7731.54] + - [752, 7731.54] - - [128, 512, 1, 2048] - - [562, 2190.34] + - [668, 2190.34] - - [2048, 1024, 1, 640] - - [552, 8581.8] + - [658, 8581.8] - - [1024, 256, 1, 3328] - - [644, 6192.71] + - [750, 6192.71] - - [4096, 1024, 1, 13312] - - [557, 9642.59] + - [663, 9642.59] - - [2048, 256, 1, 2048] - - [644, 7485.75] + - [750, 7485.75] - - [2048, 1024, 1, 13312] - - [557, 9352.26] + - [663, 9352.26] - - [2048, 512, 1, 16640] - - [645, 8839.17] + - [751, 8839.17] - - [1024, 512, 1, 128] - - [649, 4280.0] + - [755, 4280.0] - - [2048, 1024, 1, 3584] - - [552, 9264.72] + - [658, 9264.72] - - [2048, 512, 1, 256] - - [660, 6990.61] + - [766, 6990.61] - - [512, 256, 1, 3200] - - [607, 4154.52] + - [713, 4154.52] - - [4096, 1024, 1, 1920] - - [552, 9535.32] + - [658, 9535.32] - - [4096, 200, 1, 2560] - - [657, 6754.65] + - [763, 6754.65] - - [1024, 256, 1, 16384] - - [547, 6289.6] + - [653, 6289.6] - - [1024, 1024, 1, 1152] - - [650, 8407.39] + - [756, 8407.39] - - [2048, 200, 1, 32] - - [593, 1412.51] + - [699, 1412.51] - - [512, 1024, 1, 2816] - - [644, 7843.25] + - [750, 7843.25] - - [4096, 256, 1, 14336] - - [656, 8844.77] + - [762, 8844.77] - - [1024, 200, 1, 4608] - - [655, 4931.74] + - [761, 4931.74] - - [1024, 200, 1, 16384] - - [550, 5135.15] + - [656, 5135.15] - - [64, 256, 1, 1024] - - [628, 461.013] + - [734, 461.013] - - [1, 200, 1, 1024] - - [610, 7.49884] + - [716, 7.49884] - - [2048, 200, 1, 2080] - - [655, 6033.87] + - [761, 6033.87] - - [512, 256, 1, 1792] - - [565, 3153.71] + - [671, 3153.71] - - [2048, 200, 1, 1024] - - [655, 5711.3] + - [761, 5711.3] - - [4096, 1024, 1, 12288] - - [552, 9658.23] + - [658, 9658.23] - - [4096, 200, 1, 4096] - - [646, 6834.55] + - [752, 6834.55] - - [1024, 512, 1, 11264] - - [613, 7686.46] + - [719, 7686.46] - - [128, 512, 1, 1024] - - [583, 1458.99] + - [689, 1458.99] - - [32, 256, 1, 2048] - - [601, 384.899] + - [707, 384.899] - - [1024, 200, 1, 1792] - - [655, 4638.64] + - [761, 4638.64] - - [1024, 1024, 1, 1792] - - [650, 8550.56] + - [756, 8550.56] - - [32, 256, 1, 512] - - [634, 161.419] + - [740, 161.419] - - [512, 200, 1, 2816] - - [560, 3353.1] + - [666, 3353.1] - - [512, 200, 1, 3072] - - [545, 3298.89] + - [651, 3298.89] - - [1024, 1024, 1, 8192] - - [591, 8369.1] + - [697, 8369.1] - - [1024, 256, 1, 12288] - - [548, 6475.71] + - [654, 6475.71] - - [4096, 200, 1, 768] - - [650, 6367.97] + - [756, 6367.97] - - [1024, 512, 1, 16384] - - [666, 7367.12] + - [772, 7367.12] - - [4096, 256, 1, 1024] - - [646, 8214.16] + - [752, 8214.16] - - [1024, 512, 1, 256] - - [649, 5537.13] + - [755, 5537.13] - - [4096, 1024, 1, 8320] - - [552, 9674.26] + - [658, 9674.26] - - [4096, 256, 1, 9216] - - [654, 8791.02] + - [760, 8791.02] - - [1024, 512, 1, 1408] - - [644, 7459.65] + - [750, 7459.65] - - [1024, 512, 1, 5632] - - [655, 7997.91] + - [761, 7997.91] - - [4096, 200, 1, 256] - - [660, 5371.9] + - [766, 5371.9] - - [1024, 200, 1, 128] - - [638, 1998.15] + - [744, 1998.15] - - [256, 200, 1, 1024] - - [607, 1196.01] + - [713, 1196.01] - - [1024, 200, 1, 5120] - - [655, 4957.44] + - [761, 4957.44] - - [512, 1024, 1, 3072] - - [668, 7104.07] + - [774, 7104.07] - - [4096, 1024, 1, 15360] - - [552, 9669.04] + - [658, 9669.04] - - [1, 256, 1, 2048] - - [594, 13.9262] + - [700, 13.9262] - - [1024, 1024, 1, 4160] - - [646, 8759.3] + - [752, 8759.3] - - [1024, 256, 1, 256] - - [653, 3728.37] + - [759, 3728.37] - - [2048, 256, 1, 384] - - [655, 6123.17] + - [761, 6123.17] - - [512, 256, 1, 2560] - - [609, 3809.64] + - [715, 3809.64] - - [4096, 512, 1, 3072] - - [557, 9215.19] + - [663, 9215.19] - - [1024, 256, 1, 4160] - - [644, 6293.49] + - [750, 6293.49] - - [4096, 512, 1, 13312] - - [554, 9367.32] + - [660, 9367.32] - - [4096, 1024, 1, 3840] - - [552, 9631.57] + - [658, 9631.57] - - [4096, 200, 1, 640] - - [650, 6206.16] + - [756, 6206.16] - - [32, 200, 1, 2048] - - [588, 303.507] + - [694, 303.507] - - [1024, 200, 1, 512] - - [644, 3713.19] + - [750, 3713.19] - - [1024, 1024, 1, 7168] - - [647, 8475.74] + - [753, 8475.74] - - [2048, 1024, 1, 3200] - - [552, 9271.34] + - [658, 9271.34] - - [512, 512, 1, 1536] - - [655, 5832.27] + - [761, 5832.27] - - [4096, 256, 1, 768] - - [660, 8066.07] + - [766, 8066.07] - - [2048, 256, 1, 6656] - - [644, 8034.87] + - [750, 8034.87] - - [1024, 256, 1, 896] - - [644, 5467.54] + - [750, 5467.54] - - [2048, 256, 1, 512] - - [655, 6465.31] + - [761, 6465.31] - - [2048, 200, 1, 3072] - - [655, 6165.78] + - [761, 6165.78] - - [128, 200, 1, 1024] - - [612, 692.87] + - [718, 692.87] - - [4096, 512, 1, 3840] - - [557, 9272.7] + - [663, 9272.7] - - [1024, 200, 1, 3200] - - [655, 4838.85] + - [761, 4838.85] - - [4096, 512, 1, 5632] - - [552, 9335.52] + - [658, 9335.52] - - [4096, 512, 1, 64] - - [587, 5275.95] + - [693, 5275.95] - - [1024, 512, 1, 2816] - - [644, 7816.68] + - [750, 7816.68] - - [4096, 256, 1, 7680] - - [650, 8795.5] + - [756, 8795.5] - - [4096, 200, 1, 1024] - - [660, 6448.91] + - [766, 6448.91] - - [1024, 512, 1, 12288] - - [614, 7624.67] + - [720, 7624.67] - - [2048, 1024, 1, 512] - - [557, 8436.16] + - [663, 8436.16] - - [128, 256, 1, 2048] - - [631, 1342.28] + - [737, 1342.28] - - [2048, 200, 1, 1792] - - [655, 6020.47] + - [761, 6020.47] - - [1024, 1024, 1, 2816] - - [646, 8670.5] + - [752, 8670.5] - - [2048, 512, 1, 1536] - - [657, 8466.32] + - [763, 8466.32] - - [4096, 256, 1, 3072] - - [654, 8631.47] + - [760, 8631.47] - - [1024, 200, 1, 1536] - - [636, 4577.7] + - [742, 4577.7] - - [1024, 256, 1, 1024] - - [644, 5491.82] + - [750, 5491.82] - - [4096, 512, 1, 8192] - - [557, 9325.64] + - [663, 9325.64] - - [128, 1024, 1, 512] - - [655, 2534.42] + - [761, 2534.42] - - [4096, 512, 1, 2304] - - [552, 9193.09] + - [658, 9193.09] - - [2048, 256, 1, 5632] - - [655, 7999.64] + - [761, 7999.64] - - [1024, 256, 1, 5120] - - [655, 6307.32] + - [761, 6307.32] - - [1024, 512, 1, 6656] - - [655, 8028.95] + - [761, 8028.95] - - [4096, 512, 1, 2816] - - [552, 9234.5] + - [658, 9234.5] - - [4096, 200, 1, 2080] - - [639, 6697.96] + - [745, 6697.96] - - [1024, 200, 1, 2304] - - [655, 4752.91] + - [761, 4752.91] - - [2048, 200, 1, 13312] - - [644, 6346.23] + - [750, 6346.23] - - [64, 1024, 1, 1024] - - [628, 1359.68] + - [734, 1359.68] - - [4096, 256, 1, 3584] - - [650, 8668.9] + - [756, 8668.9] - - [2048, 1024, 1, 7680] - - [552, 9365.88] + - [658, 9365.88] - - [1024, 256, 1, 1664] - - [644, 5907.57] + - [750, 5907.57] - - [1, 512, 1, 2048] - - [571, 23.5057] + - [677, 23.5057] - - [512, 512, 1, 1024] - - [644, 5360.23] + - [750, 5360.23] - - [2048, 256, 1, 8192] - - [616, 7665.31] + - [722, 7665.31] - - [2048, 512, 1, 512] - - [646, 7767.33] + - [752, 7767.33] - - [4096, 512, 1, 1920] - - [552, 9133.04] + - [658, 9133.04] - - [4096, 200, 1, 12288] - - [660, 6910.75] + - [766, 6910.75] - - [1024, 512, 1, 3072] - - [590, 7310.43] + - [696, 7310.43] - - [2048, 512, 1, 1152] - - [650, 8342.36] + - [756, 8342.36] - - [1024, 256, 1, 2080] - - [644, 6010.46] + - [750, 6010.46] - - [4096, 1024, 1, 32] - - [640, 4793.59] + - [746, 4793.59] - - [4096, 512, 1, 16640] - - [552, 9365.41] + - [658, 9365.41] - - [2048, 200, 1, 9216] - - [644, 6315.98] + - [750, 6315.98] - - [2048, 200, 1, 2560] - - [644, 6119.24] + - [750, 6119.24] - - [2048, 1024, 1, 1024] - - [552, 8628.69] + - [658, 8628.69] - - [2048, 256, 1, 4608] - - [644, 7951.39] + - [750, 7951.39] - - [512, 200, 1, 768] - - [596, 2132.51] + - [702, 2132.51] - - [128, 256, 1, 512] - - [596, 670.117] + - [702, 670.117] - - [4096, 512, 1, 1792] - - [557, 9127.01] + - [663, 9127.01] - - [4096, 1024, 1, 8192] - - [552, 9591.37] + - [658, 9591.37] - - [1024, 256, 1, 2816] - - [655, 6119.11] + - [761, 6119.11] - - [1024, 1024, 1, 13312] - - [647, 8529.37] + - [753, 8529.37] - - [2048, 1024, 1, 4160] - - [552, 9305.67] + - [658, 9305.67] - - [2048, 256, 1, 3584] - - [644, 7903.23] + - [750, 7903.23] - - [128, 200, 1, 2048] - - [612, 1135.91] + - [718, 1135.91] - - [4096, 512, 1, 10240] - - [554, 9339.59] + - [660, 9339.59] - - [4096, 512, 1, 512] - - [552, 8446.78] + - [658, 8446.78] - - [2048, 1024, 1, 6656] - - [552, 9331.75] + - [658, 9331.75] - - [1024, 512, 1, 640] - - [644, 6776.04] + - [750, 6776.04] - - [2048, 512, 1, 768] - - [646, 8085.51] + - [752, 8085.51] - - [2048, 200, 1, 1408] - - [644, 5880.17] + - [750, 5880.17] - - [4096, 200, 1, 2048] - - [660, 6691.71] + - [766, 6691.71] - - [1024, 1024, 1, 5632] - - [646, 8749.63] + - [752, 8749.63] - - [2048, 512, 1, 3584] - - [650, 8704.23] + - [756, 8704.23] - - [64, 512, 1, 512] - - [586, 667.983] + - [692, 667.983] - - [64, 200, 1, 512] - - [596, 251.388] + - [702, 251.388] - - [1024, 200, 1, 64] - - [551, 1310.82] + - [657, 1310.82] - - [512, 512, 1, 2304] - - [644, 6078.8] + - [750, 6078.8] - - [2048, 1024, 1, 14336] - - [552, 9321.94] + - [658, 9321.94] - - [4096, 512, 1, 11264] - - [554, 9339.95] + - [660, 9339.95] - - [4096, 512, 1, 128] - - [639, 6566.53] + - [745, 6566.53] - - [1024, 512, 1, 64] - - [659, 2953.84] + - [765, 2953.84] - - [4096, 512, 1, 768] - - [552, 8738.23] + - [658, 8738.23] - - [4096, 1024, 1, 11264] - - [552, 9637.78] + - [658, 9637.78] - - [1, 256, 1, 1024] - - [642, 8.93234] + - [748, 8.93234] - - [4096, 200, 1, 7680] - - [639, 6889.57] + - [745, 6889.57] - - [1024, 200, 1, 12288] - - [611, 5237.74] + - [717, 5237.74] - - [1024, 1024, 1, 1280] - - [646, 8418.17] + - [752, 8418.17] - - [4096, 1024, 1, 16640] - - [552, 9675.01] + - [658, 9675.01] - - [2048, 1024, 1, 5632] - - [552, 9327.85] + - [658, 9327.85] - - [1024, 200, 1, 15360] - - [611, 5386.63] + - [717, 5386.63] - - [1, 1024, 1, 1024] - - [661, 27.3499] + - [767, 27.3499] - - [2048, 256, 1, 16384] - - [622, 7652.75] + - [728, 7652.75] - - [4096, 512, 1, 12288] - - [554, 9359.51] + - [660, 9359.51] - - [2048, 200, 1, 896] - - [655, 5628.96] + - [761, 5628.96] - - [4096, 1024, 1, 5632] - - [552, 9626.78] + - [658, 9626.78] - - [2048, 256, 1, 32] - - [648, 1889.43] + - [754, 1889.43] - - [2048, 256, 1, 1280] - - [644, 7390.94] + - [750, 7390.94] - - [4096, 256, 1, 4096] - - [646, 8694.37] + - [752, 8694.37] - - [2048, 256, 1, 11264] - - [644, 8113.95] + - [750, 8113.95] - - [4096, 200, 1, 9216] - - [646, 6891.08] + - [752, 6891.08] - - [1024, 512, 1, 4096] - - [592, 7348.46] + - [698, 7348.46] - - [2048, 1024, 1, 10240] - - [554, 9095.91] + - [660, 9095.91] - - [4096, 1024, 1, 640] - - [552, 9115.68] + - [658, 9115.68] - - [128, 1024, 1, 2048] - - [545, 3270.51] + - [651, 3270.51] - - [4096, 200, 1, 3840] - - [639, 6836.26] + - [745, 6836.26] - - [1024, 1024, 1, 1920] - - [650, 8562.82] + - [756, 8562.82] - - [2048, 200, 1, 7168] - - [655, 6296.23] + - [761, 6296.23] - - [2048, 512, 1, 16384] - - [546, 8632.51] + - [652, 8632.51] - - [2048, 1024, 1, 12288] - - [552, 9158.08] + - [658, 9158.08] - - [4096, 1024, 1, 10240] - - [552, 9658.84] + - [658, 9658.84] - - [1024, 1024, 1, 8320] - - [654, 8799.58] + - [760, 8799.58] - - [1024, 256, 1, 9216] - - [644, 6375.23] + - [750, 6375.23] - - [4096, 256, 1, 1152] - - [639, 8301.09] + - [745, 8301.09] - - [512, 200, 1, 2560] - - [605, 3088.51] + - [711, 3088.51] - - [2048, 256, 1, 1920] - - [644, 7714.94] + - [750, 7714.94] - - [2048, 1024, 1, 4608] - - [552, 9305.7] + - [658, 9305.7] - - [512, 256, 1, 1024] - - [652, 2887.74] + - [758, 2887.74] - - [1024, 256, 1, 1920] - - [636, 5913.12] + - [742, 5913.12] - - [4096, 512, 1, 3584] - - [552, 9275.69] + - [658, 9275.69] - - [2048, 512, 1, 4160] - - [657, 8734.03] + - [763, 8734.03] - - [2048, 512, 1, 5632] - - [660, 8758.98] + - [766, 8758.98] - - [4096, 1024, 1, 4608] - - [552, 9657.22] + - [658, 9657.22] - - [4096, 1024, 1, 3328] - - [552, 9621.45] + - [658, 9621.45] - - [4096, 256, 1, 7168] - - [646, 8770.05] + - [752, 8770.05] - - [4096, 200, 1, 128] - - [660, 4458.33] + - [766, 4458.33] - - [2048, 200, 1, 5120] - - [644, 6176.91] + - [750, 6176.91] - - [1024, 1024, 1, 6656] - - [646, 8780.45] + - [752, 8780.45] - - [512, 1024, 1, 3200] - - [655, 7887.09] + - [761, 7887.09] - - [512, 200, 1, 2304] - - [545, 2991.09] + - [651, 2991.09] - - [2048, 1024, 1, 9216] - - [557, 9325.46] + - [663, 9325.46] - - [2048, 256, 1, 1536] - - [655, 7551.73] + - [761, 7551.73] - - [4096, 256, 1, 256] - - [660, 6932.83] + - [766, 6932.83] - - [2048, 512, 1, 1408] - - [657, 8430.86] + - [763, 8430.86] - - [1024, 256, 1, 384] - - [649, 4462.13] + - [755, 4462.13] - - [2048, 1024, 1, 2304] - - [552, 9174.94] + - [658, 9174.94] - - [4096, 512, 1, 6144] - - [554, 9284.25] + - [660, 9284.25] - - [1024, 200, 1, 14336] - - [543, 5268.57] + - [649, 5268.57] - - [1024, 512, 1, 2080] - - [655, 7736.47] + - [761, 7736.47] - - [2048, 512, 1, 2304] - - [657, 8616.07] + - [763, 8616.07] - - [4096, 512, 1, 15360] - - [557, 9362.17] + - [663, 9362.17] - - [1024, 256, 1, 32] - - [577, 1028.12] + - [683, 1028.12] - - [1024, 200, 1, 2816] - - [655, 4780.58] + - [761, 4780.58] - - [4096, 200, 1, 512] - - [646, 6054.23] + - [752, 6054.23] - - [4096, 1024, 1, 7168] - - [557, 9468.49] + - [663, 9468.49] - - [2048, 256, 1, 14336] - - [618, 7865.52] + - [724, 7865.52] - - [1024, 200, 1, 3072] - - [655, 4804.2] + - [761, 4804.2] - - [2048, 200, 1, 1280] - - [655, 5846.31] + - [761, 5846.31] - - [1024, 1024, 1, 2304] - - [646, 8633.32] + - [752, 8633.32] - - [4096, 1024, 1, 9216] - - [552, 9641.03] + - [658, 9641.03] - - [2048, 512, 1, 4608] - - [657, 8743.3] + - [763, 8743.3] - - [4096, 1024, 1, 7680] - - [552, 9684.86] + - [658, 9684.86] - - [4096, 256, 1, 6144] - - [657, 8757.24] + - [763, 8757.24] - - [4096, 256, 1, 896] - - [650, 8258.93] + - [756, 8258.93] - - [512, 256, 1, 1536] - - [634, 3065.36] + - [740, 3065.36] - - [1024, 256, 1, 512] - - [644, 4752.85] + - [750, 4752.85] - - [2048, 256, 1, 640] - - [644, 6776.04] + - [750, 6776.04] - - [256, 256, 1, 2048] - - [581, 2249.06] + - [687, 2249.06] - - [2048, 1024, 1, 8192] - - [552, 9178.17] + - [658, 9178.17] - - [4096, 200, 1, 16640] - - [544, 7009.59] + - [650, 7009.59] - - [256, 512, 1, 512] - - [556, 2511.66] + - [662, 2511.66] - - [2048, 512, 1, 384] - - [657, 7467.7] + - [763, 7467.7] - - [2048, 200, 1, 16384] - - [625, 6327.31] + - [731, 6327.31] - - [4096, 200, 1, 10240] - - [650, 6892.74] + - [756, 6892.74] - - [1024, 512, 1, 9216] - - [599, 7530.09] + - [705, 7530.09] - - [4096, 1024, 1, 64] - - [574, 6260.26] + - [680, 6260.26] - - [4096, 200, 1, 1920] - - [660, 6710.27] + - [766, 6710.27] - - [2048, 1024, 1, 1280] - - [552, 8998.34] + - [658, 8998.34] - - [1024, 200, 1, 3840] - - [644, 4873.87] + - [750, 4873.87] - - [256, 1024, 1, 512] - - [655, 4766.35] + - [761, 4766.35] - - [2048, 1024, 1, 3328] - - [552, 9275.2] + - [658, 9275.2] - - [1024, 256, 1, 16640] - - [609, 6837.22] + - [715, 6837.22] - - [4096, 512, 1, 14336] - - [557, 9354.42] + - [663, 9354.42] - - [1024, 1024, 1, 16640] - - [654, 8832.37] + - [760, 8832.37] - - [1024, 256, 1, 1152] - - [655, 5642.66] + - [761, 5642.66] - - [512, 512, 1, 512] - - [644, 4779.93] + - [750, 4779.93] - - [4096, 512, 1, 8320] - - [557, 9327.96] + - [663, 9327.96] - - [2048, 512, 1, 7680] - - [660, 8793.96] + - [766, 8793.96] - - [4096, 1024, 1, 6656] - - [552, 9667.03] + - [658, 9667.03] - - [1024, 512, 1, 3584] - - [655, 7900.57] + - [761, 7900.57] - - [1024, 1024, 1, 32] - - [640, 2974.78] + - [746, 2974.78] - - [512, 512, 1, 2816] - - [636, 6155.85] + - [742, 6155.85] - - [2048, 512, 1, 1664] - - [660, 8496.55] + - [766, 8496.55] - - [1024, 1024, 1, 14336] - - [546, 8624.74] + - [652, 8624.74] - - [2048, 200, 1, 2048] - - [655, 6029.86] + - [761, 6029.86] - - [1024, 1024, 1, 3584] - - [646, 8702.62] + - [752, 8702.62] - - [512, 200, 1, 1280] - - [560, 2350.75] + - [666, 2350.75] - - [4096, 256, 1, 6656] - - [660, 8788.41] + - [766, 8788.41] - - [4096, 256, 1, 4160] - - [637, 8728.44] + - [743, 8728.44] - - [128, 256, 1, 1024] - - [619, 859.589] + - [725, 859.589] - - [512, 200, 1, 3200] - - [560, 3376.85] + - [666, 3376.85] - - [2048, 512, 1, 9216] - - [643, 8806.4] + - [749, 8806.4] - - [2048, 1024, 1, 256] - - [639, 7713.76] + - [745, 7713.76] - - [1024, 256, 1, 2304] - - [655, 6015.83] + - [761, 6015.83] - - [1024, 200, 1, 8192] - - [655, 5022.02] + - [761, 5022.02] - - [2048, 256, 1, 3072] - - [572, 7515.09] + - [678, 7515.09] - - [2048, 256, 1, 8320] - - [644, 8063.68] + - [750, 8063.68] - - [4096, 512, 1, 1024] - - [554, 8824.41] + - [660, 8824.41] - - [1024, 512, 1, 3200] - - [644, 7866.39] + - [750, 7866.39] - - [1024, 512, 1, 896] - - [636, 7161.11] + - [742, 7161.11] - - [2048, 512, 1, 1280] - - [650, 8384.52] + - [756, 8384.52] - - [4096, 200, 1, 64] - - [559, 3260.6] + - [665, 3260.6] - - [1024, 256, 1, 6144] - - [665, 6143.72] + - [771, 6143.72] - - [1024, 200, 1, 2560] - - [644, 4762.89] + - [750, 4762.89] - - [1024, 1024, 1, 5120] - - [573, 8454.23] + - [679, 8454.23] - - [2048, 512, 1, 6656] - - [650, 8799.05] + - [756, 8799.05] - - [4096, 1024, 1, 1536] - - [552, 9503.37] + - [658, 9503.37] - - [1024, 1024, 1, 128] - - [575, 5825.52] + - [681, 5825.52] - - [512, 1024, 1, 1792] - - [644, 7701.12] + - [750, 7701.12] - - [2048, 1024, 1, 32] - - [555, 3938.41] + - [661, 3938.41] - - [4096, 256, 1, 2816] - - [639, 8652.2] + - [745, 8652.2] - - [1024, 1024, 1, 15360] - - [546, 8719.7] + - [652, 8719.7] - - [1024, 256, 1, 5632] - - [644, 6344.18] + - [750, 6344.18] - - [1024, 1024, 1, 4096] - - [647, 8187.86] + - [753, 8187.86] - - [2048, 200, 1, 4160] - - [655, 6222.48] + - [761, 6222.48] - - [512, 256, 1, 768] - - [586, 2771.67] + - [692, 2771.67] - - [4096, 512, 1, 640] - - [557, 8590.58] + - [663, 8590.58] - - [2048, 512, 1, 8192] - - [599, 8494.9] + - [705, 8494.9] - - [1024, 512, 1, 768] - - [644, 7049.35] + - [750, 7049.35] - - [4096, 200, 1, 8320] - - [639, 6908.7] + - [745, 6908.7] - - [2048, 512, 1, 896] - - [646, 8224.23] + - [752, 8224.23] - - [4096, 200, 1, 7168] - - [657, 6878.59] + - [763, 6878.59] - - [2048, 512, 1, 13312] - - [645, 8803.04] + - [751, 8803.04] - - [64, 512, 1, 1024] - - [549, 844.024] + - [655, 844.024] - - [2048, 200, 1, 3840] - - [644, 6192.48] + - [750, 6192.48] - - [1024, 1024, 1, 768] - - [637, 8098.51] + - [743, 8098.51] - - [4096, 512, 1, 16384] - - [557, 9345.73] + - [663, 9345.73] - - [4096, 256, 1, 2304] - - [637, 8596.45] + - [743, 8596.45] - - [1, 256, 1, 4096] - - [642, 19.9293] + - [748, 19.9293] - - [1024, 1024, 1, 11264] - - [647, 8491.48] + - [753, 8491.48] - - [2048, 200, 1, 16640] - - [641, 6510.64] + - [747, 6510.64] - - [1024, 256, 1, 3072] - - [655, 6179.55] + - [761, 6179.55] - - [4096, 1024, 1, 512] - - [552, 9032.25] + - [658, 9032.25] - - [2048, 256, 1, 2816] - - [644, 7793.57] + - [750, 7793.57] - - [32, 512, 1, 512] - - [556, 318.816] + - [662, 318.816] - - [256, 512, 1, 2048] - - [607, 3369.02] + - [713, 3369.02] - - [1024, 512, 1, 384] - - [655, 6198.58] + - [761, 6198.58] - - [2048, 200, 1, 7680] - - [644, 6307.7] + - [750, 6307.7] - - [1024, 512, 1, 4608] - - [655, 7953.48] + - [761, 7953.48] - - [4096, 200, 1, 32] - - [604, 2199.29] + - [710, 2199.29] - - [4096, 200, 1, 3328] - - [639, 6813.12] + - [745, 6813.12] - - [1024, 200, 1, 1152] - - [644, 4375.65] + - [750, 4375.65] - - [1024, 1024, 1, 1408] - - [646, 8457.91] + - [752, 8457.91] - - [2048, 200, 1, 15360] - - [620, 6333.1] + - [726, 6333.1] - - [512, 1024, 1, 2048] - - [630, 6280.76] + - [736, 6280.76] - - [1024, 512, 1, 1024] - - [655, 7064.19] + - [761, 7064.19] - - [1024, 200, 1, 10240] - - [644, 5030.69] + - [750, 5030.69] - - [4096, 256, 1, 5632] - - [657, 8765.22] + - [763, 8765.22] - - [512, 512, 1, 3072] - - [667, 5942.44] + - [773, 5942.44] - - [2048, 256, 1, 1408] - - [644, 7545.05] + - [750, 7545.05] - - [2048, 256, 1, 6144] - - [655, 7963.97] + - [761, 7963.97] - - [4096, 256, 1, 3328] - - [650, 8682.58] + - [756, 8682.58] - - [1024, 200, 1, 1664] - - [644, 4595.4] + - [750, 4595.4] - - [2048, 1024, 1, 1152] - - [552, 8942.65] + - [658, 8942.65] - - [2048, 512, 1, 6144] - - [645, 8729.71] + - [751, 8729.71] - - [2048, 512, 1, 3200] - - [646, 8696.56] + - [752, 8696.56] - - [4096, 1024, 1, 2080] - - [585, 9538.45] + - [691, 9538.45] - - [4096, 1024, 1, 768] - - [552, 9260.75] + - [658, 9260.75] - - [4096, 1024, 1, 2560] - - [552, 9567.27] + - [658, 9567.27] - - [64, 200, 1, 2048] - - [584, 583.161] + - [690, 583.161] - - [2048, 200, 1, 4608] - - [655, 6243.28] + - [761, 6243.28] - - [1024, 1024, 1, 6144] - - [647, 8320.25] + - [753, 8320.25] - - [4096, 256, 1, 1664] - - [650, 8503.17] + - [756, 8503.17] - - [2048, 200, 1, 384] - - [655, 4940.0] + - [761, 4940.0] - - [1, 200, 1, 2048] - - [601, 11.3281] + - [707, 11.3281] - - [4096, 256, 1, 1792] - - [660, 8504.12] + - [766, 8504.12] - - [2048, 1024, 1, 64] - - [574, 5309.35] + - [680, 5309.35] - - [4096, 1024, 1, 16384] - - [541, 9428.61] + - [647, 9428.61] - - [1024, 512, 1, 16640] - - [655, 8122.55] + - [761, 8122.55] - - [2048, 512, 1, 10240] - - [645, 8766.21] + - [751, 8766.21] - - [4096, 512, 1, 6656] - - [552, 9351.75] + - [658, 9351.75] - - [2048, 256, 1, 16640] - - [644, 8135.27] + - [750, 8135.27] - - [2048, 512, 1, 2816] - - [646, 8660.32] + - [752, 8660.32] - - [1024, 200, 1, 32] - - [564, 780.291] + - [670, 780.291] - - [1, 512, 1, 4096] - - [589, 34.8671] + - [695, 34.8671] - - [256, 256, 1, 1024] - - [596, 1490.08] + - [702, 1490.08] - - [2048, 1024, 1, 128] - - [569, 6605.3] + - [675, 6605.3] - - [2048, 1024, 1, 2080] - - [552, 9159.51] + - [658, 9159.51] - - [2048, 1024, 1, 16640] - - [552, 9371.65] + - [658, 9371.65] - - [1024, 200, 1, 384] - - [655, 3378.24] + - [761, 3378.24] - - [4096, 256, 1, 384] - - [600, 7369.3] + - [706, 7369.3] - - [4096, 256, 1, 13312] - - [654, 8776.48] + - [760, 8776.48] - - [2048, 256, 1, 128] - - [649, 4280.0] + - [755, 4280.0] - - [512, 256, 1, 2304] - - [561, 3584.98] + - [667, 3584.98] - - [2048, 1024, 1, 3072] - - [554, 9156.52] + - [660, 9156.52] - - [1024, 1024, 1, 640] - - [650, 7928.84] + - [756, 7928.84] - - [256, 512, 1, 1024] - - [655, 2843.7] + - [761, 2843.7] - - [4096, 1024, 1, 1408] - - [552, 9437.56] + - [658, 9437.56] - - [4096, 200, 1, 5632] - - [657, 6873.96] + - [763, 6873.96] - - [4096, 1024, 1, 2048] - - [552, 9437.1] + - [658, 9437.1] - - [2048, 1024, 1, 2560] - - [557, 9195.62] + - [663, 9195.62] - - [4096, 1024, 1, 128] - - [639, 7407.26] + - [745, 7407.26] - - [1024, 200, 1, 3328] - - [655, 4857.39] + - [761, 4857.39] - - [2048, 200, 1, 1152] - - [644, 5760.1] + - [750, 5760.1] - - [1024, 200, 1, 9216] - - [543, 5053.21] + - [649, 5053.21] - - [4096, 256, 1, 512] - - [637, 7617.45] + - [743, 7617.45] - - [4096, 1024, 1, 14336] - - [552, 9665.12] + - [658, 9665.12] - - [1024, 1024, 1, 384] - - [575, 7478.8] + - [681, 7478.8] - - [2048, 200, 1, 512] - - [644, 5150.28] + - [750, 5150.28] - - [2048, 256, 1, 9216] - - [623, 7717.71] + - [729, 7717.71] - - [2048, 256, 1, 1792] - - [644, 7655.94] + - [750, 7655.94] - - [4096, 512, 1, 9216] - - [554, 9331.22] + - [660, 9331.22] - - [4096, 200, 1, 15360] - - [544, 6958.14] + - [650, 6958.14] - - [1024, 512, 1, 2048] - - [643, 7067.91] + - [749, 7067.91] - - [64, 256, 1, 2048] - - [568, 723.256] + - [674, 723.256] - - [4096, 200, 1, 1792] - - [646, 6699.65] + - [752, 6699.65] - - [1, 200, 1, 4096] - - [578, 15.6387] + - [684, 15.6387] - - [2048, 1024, 1, 2048] - - [557, 9071.93] + - [663, 9071.93] - - [1024, 200, 1, 2080] - - [636, 4679.19] + - [742, 4679.19] - - [2048, 200, 1, 1536] - - [655, 5939.92] + - [761, 5939.92] - - [1024, 1024, 1, 3072] - - [617, 8333.15] + - [723, 8333.15] - - [512, 200, 1, 1792] - - [542, 2679.73] + - [648, 2679.73] - - [1024, 256, 1, 11264] - - [545, 6470.98] + - [651, 6470.98] - - [2048, 512, 1, 12288] - - [592, 8729.24] + - [698, 8729.24] - - [1024, 256, 1, 1792] - - [655, 5931.44] + - [761, 5931.44] - - [1024, 200, 1, 7168] - - [655, 4970.33] + - [761, 4970.33] - - [32, 256, 1, 1024] - - [566, 237.334] + - [672, 237.334] - - [512, 256, 1, 3072] - - [609, 3813.1] + - [715, 3813.1] - - [1024, 1024, 1, 2080] - - [646, 8600.41] + - [752, 8600.41] - - [2048, 200, 1, 2304] - - [655, 6093.32] + - [761, 6093.32] - - [4096, 512, 1, 1536] - - [552, 9075.0] + - [658, 9075.0] - - [2048, 256, 1, 7168] - - [655, 7895.26] + - [761, 7895.26] - - [2048, 512, 1, 1792] - - [657, 8531.92] + - [763, 8531.92] - - [1024, 200, 1, 2048] - - [644, 4685.43] + - [750, 4685.43] - - [1024, 1024, 1, 4608] - - [650, 8735.71] + - [756, 8735.71] - - [4096, 256, 1, 8192] - - [646, 8782.55] + - [752, 8782.55] - - [512, 1024, 1, 1280] - - [636, 7483.25] + - [742, 7483.25] - - [2048, 1024, 1, 16384] - - [546, 8878.96] + - [652, 8878.96] - - [512, 512, 1, 1280] - - [644, 5745.72] + - [750, 5745.72] - - [1024, 200, 1, 1280] - - [636, 4446.23] - - - [4096, 512, 1, 4096] - - [554, 9264.49] + - [742, 4446.23] - - [2048, 256, 1, 3200] - - [644, 7842.85] + - [750, 7842.85] - - [2048, 512, 1, 15360] - - [592, 8757.24] + - [698, 8757.24] - - [1024, 512, 1, 3328] - - [644, 7854.04] + - [750, 7854.04] - - [1024, 512, 1, 4160] - - [644, 7934.61] + - [750, 7934.61] - - [4096, 200, 1, 6656] - - [646, 6883.3] + - [752, 6883.3] - - [4096, 1024, 1, 1024] - - [552, 9229.44] + - [658, 9229.44] - - [2048, 200, 1, 3328] - - [655, 6182.74] + - [761, 6182.74] - - [1024, 1024, 1, 256] - - [575, 6932.83] + - [681, 6932.83] - - [512, 200, 1, 512] - - [596, 1910.77] + - [702, 1910.77] - - [2048, 256, 1, 64] - - [567, 2912.81] + - [673, 2912.81] - - [1024, 256, 1, 2560] - - [644, 6123.17] + - [750, 6123.17] - - [2048, 512, 1, 11264] - - [656, 8728.94] + - [762, 8728.94] - - [32, 200, 1, 1024] - - [651, 187.56] + - [757, 187.56] - - [32, 512, 1, 2048] - - [595, 694.521] + - [701, 694.521] - - [2048, 256, 1, 2304] - - [644, 7759.35] + - [750, 7759.35] - - [2048, 256, 1, 12288] - - [623, 7726.35] + - [729, 7726.35] - - [4096, 200, 1, 8192] - - [646, 6870.94] + - [752, 6870.94] - - [1024, 512, 1, 7168] - - [592, 7479.2] + - [698, 7479.2] - - [1024, 512, 1, 1792] - - [644, 7626.11] + - [750, 7626.11] - - [4096, 1024, 1, 1664] - - [552, 9503.54] + - [658, 9503.54] - - [4096, 200, 1, 2816] - - [639, 6775.44] + - [745, 6775.44] - - [1024, 1024, 1, 896] - - [646, 8229.99] + - [752, 8229.99] - - [1024, 200, 1, 8320] - - [607, 5173.58] + - [713, 5173.58] - - [1024, 1024, 1, 12288] - - [647, 8463.21] + - [753, 8463.21] - - [1024, 256, 1, 8320] - - [636, 6404.37] + - [742, 6404.37] - - [1024, 200, 1, 1024] - - [644, 4297.54] + - [750, 4297.54] - - [1024, 200, 1, 16640] - - [606, 5499.51] + - [712, 5499.51] - - [4096, 256, 1, 5120] - - [660, 8729.15] + - [766, 8729.15] - - [1024, 256, 1, 3200] - - [655, 6124.96] + - [761, 6124.96] - - [512, 512, 1, 2560] - - [655, 6109.79] + - [761, 6109.79] - - [4096, 256, 1, 2048] - - [660, 8511.05] + - [766, 8511.05] - - [1024, 256, 1, 640] - - [644, 5102.66] + - [750, 5102.66] - - [2048, 256, 1, 5120] - - [572, 7667.93] + - [678, 7667.93] - - [2048, 256, 1, 7680] - - [655, 8054.45] + - [761, 8054.45] - - [4096, 512, 1, 384] - - [650, 8190.77] + - [756, 8190.77] - - [2048, 200, 1, 3584] - - [644, 6166.12] + - [750, 6166.12] - - [1024, 512, 1, 1536] - - [644, 7517.9] + - [750, 7517.9] - - [4096, 512, 1, 3328] - - [552, 9259.45] + - [658, 9259.45] - - [4096, 1024, 1, 256] - - [552, 8341.79] + - [658, 8341.79] - - [2048, 200, 1, 64] - - [615, 2307.71] + - [721, 2307.71] - - [2048, 200, 1, 4096] - - [655, 6212.04] + - [761, 6212.04] - - [1024, 1024, 1, 1536] - - [646, 8484.15] + - [752, 8484.15] - - [2048, 1024, 1, 7168] - - [554, 9315.24] + - [660, 9315.24] - - [1024, 256, 1, 3584] - - [644, 6207.32] + - [750, 6207.32] - - [4096, 256, 1, 32] - - [648, 2892.72] + - [754, 2892.72] - - [4096, 256, 1, 1280] - - [657, 8392.9] + - [763, 8392.9] - - [512, 512, 1, 3200] - - [655, 6219.41] + - [761, 6219.41] - - [2048, 1024, 1, 1536] - - [554, 9052.55] + - [660, 9052.55] - - [2048, 256, 1, 1024] - - [644, 7192.9] + - [750, 7192.9] - - [128, 200, 1, 512] - - [634, 502.677] + - [740, 502.677] - - [4096, 512, 1, 7168] - - [557, 9329.11] + - [663, 9329.11] - - [1024, 512, 1, 1152] - - [644, 7358.53] + - [750, 7358.53] - - [64, 1024, 1, 2048] - - [562, 2102.51] + - [668, 2102.51] - - [2048, 512, 1, 3328] - - [646, 8694.69] + - [752, 8694.69] - - [4096, 1024, 1, 896] - - [552, 9343.02] + - [658, 9343.02] - - [1, 1024, 1, 2048] - - [602, 40.9324] + - [708, 40.9324] - - [4096, 200, 1, 3584] - - [650, 6810.3] + - [756, 6810.3] - - [4096, 1024, 1, 4096] - - [552, 9347.56] + - [658, 9347.56] - - [1024, 256, 1, 14336] - - [545, 6625.8] + - [651, 6625.8] - - [2048, 200, 1, 256] - - [644, 4413.3] + - [750, 4413.3] - - [4096, 256, 1, 16384] - - [546, 8752.13] + - [652, 8752.13] - - [4096, 256, 1, 1920] - - [637, 8533.78] + - [743, 8533.78] - - [32, 1024, 1, 512] - - [635, 647.369] + - [741, 647.369] - - [1024, 256, 1, 7680] - - [655, 6387.36] + - [761, 6387.36] - - [2048, 256, 1, 1664] - - [655, 7631.44] + - [761, 7631.44] - - [512, 200, 1, 1536] - - [560, 2576.88] + - [666, 2576.88] - - [2048, 1024, 1, 6144] - - [541, 9033.77] + - [647, 9033.77] - - [512, 256, 1, 2816] - - [607, 3977.46] + - [713, 3977.46] - - [4096, 512, 1, 4160] - - [554, 9289.02] + - [660, 9289.02] - - [4096, 512, 1, 2080] - - [633, 9150.28] + - [739, 9150.28] - - [2048, 256, 1, 15360] - - [618, 7963.97] + - [724, 7963.97] - - [4096, 200, 1, 5120] - - [657, 6861.62] + - [763, 6861.62] - - [1024, 512, 1, 8192] - - [643, 7473.25] + - [749, 7473.25] - - [4096, 200, 1, 896] - - [660, 6443.25] + - [766, 6443.25] - - [2048, 512, 1, 8320] - - [650, 8810.24] + - [756, 8810.24] - - [1024, 1024, 1, 10240] - - [658, 8436.7] + - [764, 8436.7] - - [1024, 200, 1, 768] - - [644, 4087.58] + - [750, 4087.58] - - [2048, 200, 1, 640] - - [655, 5416.3] + - [761, 5416.3] - - [512, 200, 1, 2048] - - [609, 2702.62] + - [715, 2702.62] - - [1024, 1024, 1, 9216] - - [647, 8499.08] + - [753, 8499.08] - - [4096, 200, 1, 1408] - - [657, 6613.82] + - [763, 6613.82] - - [1024, 256, 1, 13312] - - [545, 6643.54] + - [651, 6643.54] - - [1024, 256, 1, 128] - - [576, 2706.1] + - [682, 2706.1] - - [2048, 200, 1, 5632] - - [655, 6270.12] + - [761, 6270.12] - - [64, 1024, 1, 512] - - [634, 1310.82] + - [740, 1310.82] - - [1024, 512, 1, 2560] - - [655, 7731.54] + - [761, 7731.54] - - [4096, 200, 1, 1280] - - [637, 6566.83] + - [743, 6566.83] - - [1024, 200, 1, 4096] - - [655, 4911.46] + - [761, 4911.46] - - [1024, 1024, 1, 2560] - - [646, 8630.35] + - [752, 8630.35] - - [2048, 512, 1, 64] - - [650, 4152.88] + - [756, 4152.88] - - [2048, 200, 1, 8192] - - [644, 6234.21] + - [750, 6234.21] - - [2048, 512, 1, 3072] - - [654, 8614.85] + - [760, 8614.85] - - [4096, 1024, 1, 5120] - - [552, 9573.75] + - [658, 9573.75] - - [4096, 256, 1, 640] - - [639, 7913.88] + - [745, 7913.88] - - [1024, 256, 1, 1280] - - [644, 5706.64] + - [750, 5706.64] - - [2048, 1024, 1, 1920] - - [554, 9141.34] + - [660, 9141.34] - - [2048, 256, 1, 4096] - - [644, 7937.28] + - [750, 7937.28] - - [2048, 1024, 1, 15360] - - [557, 9351.96] + - [663, 9351.96] - - [4096, 200, 1, 16384] - - [546, 6975.21] + - [652, 6975.21] - - [1, 1024, 1, 4096] - - [664, 60.7815] + - [770, 60.7815] - - [4096, 1024, 1, 2816] - - [552, 9583.98] + - [658, 9583.98] - - [4096, 200, 1, 1664] - - [639, 6658.7] + - [745, 6658.7] - - [4096, 512, 1, 256] - - [570, 7731.54] + - [676, 7731.54] - - [1024, 200, 1, 896] - - [644, 4193.45] + - [750, 4193.45] - - [2048, 200, 1, 6656] - - [655, 6291.17] + - [761, 6291.17] - - [2048, 1024, 1, 5120] - - [554, 9270.57] + - [660, 9270.57] - - [512, 1024, 1, 768] - - [644, 7099.06] + - [750, 7099.06] - - [2048, 512, 1, 14336] - - [624, 8559.13] + - [730, 8559.13] - - [2048, 200, 1, 8320] - - [644, 6314.72] + - [750, 6314.72] - - [4096, 256, 1, 3840] - - [660, 8718.56] + - [766, 8718.56] - - [2048, 1024, 1, 4096] - - [541, 8973.38] + - [647, 8973.38] - - [1024, 1024, 1, 3200] - - [650, 8701.98] + - [756, 8701.98] - - [1024, 256, 1, 4608] - - [644, 6268.05] + - [750, 6268.05] - - [4096, 512, 1, 4608] - - [552, 9316.47] + - [658, 9316.47] - - [2048, 512, 1, 2048] - - [643, 8462.76] + - [749, 8462.76] - - [4096, 512, 1, 1664] - - [552, 9074.53] + - [658, 9074.53] - - [4096, 256, 1, 4608] - - [639, 8718.05] + - [745, 8718.05] - - [1024, 512, 1, 32] - - [632, 1807.99] + - [738, 1807.99] - - [1024, 512, 1, 3840] - - [644, 7936.34] + - [750, 7936.34] - - [2048, 512, 1, 1920] - - [660, 8548.27] + - [766, 8548.27] - - [2048, 1024, 1, 896] - - [552, 8843.51] + - [658, 8843.51] - - [4096, 200, 1, 6144] - - [660, 6864.76] + - [766, 6864.76] - - [1024, 512, 1, 13312] - - [613, 7763.19] + - [719, 7763.19] - - [4096, 1024, 1, 4160] - - [552, 9650.72] + - [658, 9650.72] - - [2048, 200, 1, 2816] - - [644, 6119.76] + - [750, 6119.76] - - [1024, 1024, 1, 3840] - - [639, 8709.5] + - [745, 8709.5] - - [128, 1024, 1, 1024] - - [662, 2577.25] + - [768, 2577.25] - - [2048, 1024, 1, 11264] - - [557, 9339.06] + - [663, 9339.06] - - [2048, 1024, 1, 384] - - [646, 8210.81] + - [752, 8210.81] - - [1024, 256, 1, 2048] - - [667, 5755.58] + - [773, 5755.58] - - [2048, 1024, 1, 3840] - - [554, 9288.96] + - [660, 9288.96] - - [4096, 256, 1, 8320] - - [660, 8812.38] + - [766, 8812.38] - - [2048, 256, 1, 3840] - - [636, 7857.05] + - [742, 7857.05] - - [64, 256, 1, 512] - - [634, 336.182] + - [740, 336.182] - - [4096, 512, 1, 1280] - - [554, 8993.52] + - [660, 8993.52] - - [512, 256, 1, 1280] - - [586, 2996.03] + - [692, 2996.03] - - [1024, 512, 1, 7680] - - [644, 8041.59] + - [750, 8041.59] - - [4096, 1024, 1, 1152] - - [552, 9368.48] + - [658, 9368.48] - - [256, 200, 1, 512] - - [586, 993.07] + - [692, 993.07] - - [256, 1024, 1, 2048] - - [663, 4759.59] + - [769, 4759.59] - - [2048, 200, 1, 10240] - - [655, 6329.03] + - [761, 6329.03] - - [2048, 512, 1, 5120] - - [656, 8732.56] + - [762, 8732.56] - - [2048, 1024, 1, 1408] - - [554, 9006.9] + - [660, 9006.9] - - [512, 1024, 1, 512] - - [644, 6528.2] + - [750, 6528.2] - - [1024, 200, 1, 11264] - - [611, 5194.82] + - [717, 5194.82] - - [512, 1024, 1, 1024] - - [597, 6337.1] + - [703, 6337.1] - - [2048, 512, 1, 32] - - [563, 2777.78] + - [669, 2777.78] - - [4096, 256, 1, 2560] - - [646, 8621.49] + - [752, 8621.49] - - [4096, 256, 1, 64] - - [580, 4194.4] + - [686, 4194.4] - - [32, 1024, 1, 1024] - - [581, 778.264] + - [687, 778.264] - - [2048, 200, 1, 768] - - [655, 5507.33] + - [761, 5507.33] - - [512, 512, 1, 2048] - - [603, 5338.91] + - [709, 5338.91] - - [2048, 512, 1, 2560] - - [657, 8643.69] + - [763, 8643.69] - - [512, 256, 1, 512] - - [636, 2542.1] + - [742, 2542.1] - - [1024, 200, 1, 7680] - - [611, 5047.8] + - [717, 5047.8] - - [4096, 512, 1, 896] - - [552, 8856.85] + - [658, 8856.85] - - [4096, 1024, 1, 3072] - - [552, 9492.17] + - [658, 9492.17] - - [4096, 200, 1, 13312] - - [544, 6900.73] + - [650, 6900.73] - - [2048, 512, 1, 7168] - - [645, 8788.1] + - [751, 8788.1] - - [2048, 1024, 1, 2816] - - [557, 9229.88] + - [663, 9229.88] - - [2048, 512, 1, 128] - - [575, 5630.04] + - [681, 5630.04] - - [1024, 256, 1, 8192] - - [667, 6203.83] + - [773, 6203.83] - - [4096, 1024, 1, 1792] - - [552, 9510.42] + - [658, 9510.42] - - [1024, 200, 1, 6656] - - [636, 5002.85] + - [742, 5002.85] - - [1024, 1024, 1, 1024] - - [573, 8095.26] + - [679, 8095.26] - - [4096, 200, 1, 2304] - - [657, 6754.45] + - [763, 6754.45] - - [4096, 512, 1, 1152] - - [552, 8974.54] + - [658, 8974.54] - - [512, 200, 1, 1024] - - [634, 2233.01] + - [740, 2233.01] - - [1024, 256, 1, 3840] - - [655, 6244.72] + - [761, 6244.72] - - [512, 512, 1, 768] - - [644, 5331.84] + - [750, 5331.84] - - [2048, 512, 1, 4096] - - [654, 8621.76] + - [760, 8621.76] - - [2048, 256, 1, 2560] - - [644, 7770.93] + - [750, 7770.93] - - [2048, 256, 1, 4160] - - [655, 7923.08] + - [761, 7923.08] - - [1024, 256, 1, 64] - - [551, 1705.1] + - [657, 1705.1] - - [4096, 512, 1, 7680] - - [552, 9364.57] + - [658, 9364.57] - - [1024, 512, 1, 1664] - - [655, 7594.24] + - [761, 7594.24] - - [2048, 512, 1, 2080] - - [646, 8570.67] + - [752, 8570.67] - - [2048, 512, 1, 3840] - - [657, 8729.14] + - [763, 8729.14] - - [4096, 1024, 1, 384] - - [552, 8764.86] + - [658, 8764.86] - - [4096, 200, 1, 3072] - - [646, 6772.39] + - [752, 6772.39] - - [1024, 512, 1, 14336] - - [614, 7680.97] + - [720, 7680.97] - - [1024, 200, 1, 1920] - - [636, 4637.08] + - [742, 4637.08] - - [1024, 1024, 1, 1664] - - [650, 8506.49] + - [756, 8506.49] - - [512, 1024, 1, 2304] - - [644, 7775.33] + - [750, 7775.33] - - [2048, 1024, 1, 1792] - - [552, 9123.46] + - [658, 9123.46] - - [32, 200, 1, 512] - - [652, 125.744] + - [758, 125.744] - - [4096, 256, 1, 11264] - - [657, 8822.31] + - [763, 8822.31] - - [4096, 256, 1, 1408] - - [657, 8419.32] + - [763, 8419.32] - - [1024, 256, 1, 7168] - - [644, 6377.54] + - [750, 6377.54] - - [2048, 256, 1, 1152] - - [655, 7401.81] + - [761, 7401.81] - - [256, 256, 1, 512] - - [634, 1314.93] + - [740, 1314.93] - - [1024, 512, 1, 1280] - - [644, 7410.53] + - [750, 7410.53] - - [512, 512, 1, 1792] - - [636, 5931.44] + - [742, 5931.44] - - [2048, 200, 1, 12288] - - [618, 6242.25] + - [724, 6242.25] - - [2048, 200, 1, 1664] - - [655, 5953.75] + - [761, 5953.75] - - [4096, 200, 1, 4608] - - [650, 6853.54] + - [756, 6853.54] - - [512, 1024, 1, 2560] - - [644, 7778.13] + - [750, 7778.13] - - [4096, 200, 1, 384] - - [637, 5765.73] + - [743, 5765.73] - - [128, 512, 1, 512] - - [634, 1302.68] + - [740, 1302.68] - - [1024, 200, 1, 256] - - [638, 2861.93] + - [744, 2861.93] - - [256, 1024, 1, 1024] - - [579, 4522.26] + - [685, 4522.26] - - [2048, 200, 1, 128] - - [644, 3310.0] + - [750, 3310.0] - - [2048, 200, 1, 11264] - - [625, 6168.2] + - [731, 6168.2] - - [1024, 512, 1, 1920] - - [655, 7649.29] + - [761, 7649.29] - - [4096, 256, 1, 1536] - - [650, 8427.33] + - [756, 8427.33] - - [4096, 1024, 1, 3584] - - [552, 9618.0] + - [658, 9618.0] - - [2048, 256, 1, 256] - - [644, 5464.99] + - [750, 5464.99] - - [2048, 1024, 1, 768] - - [552, 8726.87] + - [658, 8726.87] - - [4096, 256, 1, 10240] - - [646, 8790.89] + - [752, 8790.89] - - [2048, 256, 1, 10240] - - [626, 7665.31] + - [732, 7665.31] - - [4096, 200, 1, 14336] - - [660, 6916.18] + - [766, 6916.18] - - [1024, 512, 1, 5120] - - [598, 7420.36] + - [704, 7420.36] - - [1024, 512, 1, 8320] - - [655, 8061.31] + - [761, 8061.31] - - [256, 200, 1, 2048] - - [610, 1916.36] + - [716, 1916.36] - - [1024, 200, 1, 640] - - [638, 3873.39] + - [744, 3873.39] - - [1024, 512, 1, 10240] - - [643, 7526.9] + - [749, 7526.9] - - [1024, 200, 1, 4160] - - [655, 4928.19] + - [761, 4928.19] - - [1024, 200, 1, 5632] - - [636, 4978.66] + - [742, 4978.66] - - [1024, 1024, 1, 2048] - - [591, 7937.28] + - [697, 7937.28] - - [1024, 256, 1, 6656] - - [655, 6373.68] + - [761, 6373.68] - - [2048, 1024, 1, 8320] - - [552, 9333.15] + - [658, 9333.15] - - [1024, 256, 1, 10240] - - [644, 6407.29] + - [750, 6407.29] - - [2048, 256, 1, 2080] - - [644, 7714.58] + - [750, 7714.58] - - [4096, 256, 1, 128] - - [558, 5765.47] + - [664, 5765.47] - - [1024, 256, 1, 768] - - [649, 5210.42] + - [755, 5210.42] - - [2048, 256, 1, 896] - - [655, 7267.46] + - [761, 7267.46] - - [64, 512, 1, 2048] - - [621, 1296.64] + - [727, 1296.64] - - [4096, 512, 1, 2048] - - [554, 9121.25] + - [660, 9121.25] - - [512, 256, 1, 2048] - - [607, 3283.31] + - [713, 3283.31] - - [4096, 256, 1, 16640] - - [639, 8839.88] + - [745, 8839.88] - - [4096, 512, 1, 2560] - - [557, 9222.15] + - [663, 9222.15] - - [1024, 512, 1, 15360] - - [608, 7865.66] + - [714, 7865.66] - - [4096, 1024, 1, 2304] - - [552, 9558.26] + - [658, 9558.26] - - [4096, 200, 1, 1152] - - [657, 6531.93] + - [763, 6531.93] - - [2048, 200, 1, 6144] - - [655, 6277.75] + - [761, 6277.75] - - [1024, 1024, 1, 7680] - - [650, 8799.34] + - [756, 8799.34] - - [2048, 200, 1, 1920] - - [655, 6031.02] + - [761, 6031.02] - - [32, 1024, 1, 2048] - - [629, 1174.98] + - [735, 1174.98] - - [1024, 200, 1, 3584] - - [636, 4880.44] + - [742, 4880.44] - - [4096, 256, 1, 2080] - - [643, 8557.22] + - [749, 8557.22] - - [1024, 1024, 1, 16384] - - [544, 8618.65] + - [650, 8618.65] - - [1024, 256, 1, 1408] - - [655, 5803.54] + - [761, 5803.54] - - [1024, 256, 1, 4096] - - [665, 6037.78] + - [771, 6037.78] - - [2048, 200, 1, 14336] - - [655, 6364.48] + - [761, 6364.48] - - [4096, 512, 1, 5120] - - [554, 9302.05] + - [660, 9302.05] - - [1024, 512, 1, 6144] - - [590, 7469.09] + - [696, 7469.09] - - [1024, 512, 1, 2304] - - [655, 7759.35] + - [761, 7759.35] - - [4096, 200, 1, 4160] - - [639, 6843.22] + - [745, 6843.22] - - [4096, 200, 1, 1536] - - [650, 6628.27] + - [756, 6628.27] - - [4096, 1024, 1, 6144] - - [552, 9593.08] - - - [1280, 384, 1, 64] - - [683, 3196.98] + - [658, 9593.08] - - [256, 64, 1, 1225] - - [684, 1194.77] + - [790, 1194.77] - - [2048, 320, 1, 64] - - [686, 3449.36] - - - [256, 48, 1, 1225] - - [677, 913.498] - - - [2048, 192, 1, 64] - - [676, 2516.68] + - [792, 3449.36] - - [1024, 128, 1, 289] - - [690, 2869.78] - - - [1280, 192, 1, 64] - - [669, 1872.56] - - - [192, 32, 1, 1225] - - [674, 505.906] - - - [1280, 448, 1, 64] - - [670, 3078.97] + - [796, 2869.78] - - [384, 64, 1, 1225] - - [675, 1511.43] + - [781, 1511.43] - - [2048, 384, 1, 64] - - [688, 3836.35] - - - [288, 48, 1, 1225] - - [671, 1032.69] + - [794, 3836.35] - - [64, 80, 1, 5329] - - [687, 888.267] + - [793, 888.267] - - [1024, 384, 1, 289] - - [681, 4291.62] + - [787, 4291.62] - - [2048, 448, 1, 64] - - [680, 3783.62] - - - [1280, 320, 1, 64] - - [686, 2777.05] - - - [192, 64, 1, 1225] - - [671, 926.997] - - - [384, 192, 1, 1225] - - [682, 2560.1] - - - [1536, 256, 1, 64] - - [689, 2621.54] - - - [192, 48, 1, 1225] - - [674, 698.714] - - - [768, 128, 1, 289] - - [691, 2291.22] - - - [1024, 256, 1, 289] - - [689, 4064.46] + - [786, 3783.62] - - [768, 192, 1, 289] - - [685, 2690.43] - - - [1536, 384, 1, 64] - - [672, 3145.83] + - [791, 2690.43] - - [288, 64, 1, 1225] - - [674, 1142.77] - - - [1024, 192, 1, 289] - - [679, 3243.23] + - [780, 1142.77] - - [384, 96, 1, 1225] - - [692, 1844.81] - - - [160, 64, 1, 5329] - - [678, 1564.58] - - - [768, 160, 1, 289] - - [673, 2386.68] + - [798, 1844.81] - - [1024, 3392, 1, 4096] - - [718, 8503.02] + - [824, 8503.02] - - [1024, 3301, 1, 4096] - - [720, 8414.1] + - [826, 8414.1] - - [1024, 3443, 1, 4096] - - [707, 8536.59] + - [813, 8536.59] - - [132, 134, 480, 64] - - [745, 4149.27] + - [851, 4149.27] - - [162, 162, 400, 64] - - [733, 5539.73] + - [839, 5539.73] - - [4096, 3548, 1, 1024] - - [699, 9773.01] + - [805, 9773.01] - - [4096, 2977, 1, 1024] - - [700, 9574.43] + - [806, 9574.43] - - [132, 135, 480, 64] - - [745, 4167.51] + - [851, 4167.51] - - [1024, 2985, 1, 4096] - - [703, 9133.99] + - [809, 9133.99] - - [33708, 3681, 1, 1024] - - [700, 10033.8] + - [806, 10033.8] - - [4096, 3443, 1, 1024] - - [700, 9513.78] + - [806, 9513.78] - - [11, 11, 5456, 64] - - [742, 627.346] + - [848, 627.346] - - [1024, 3400, 1, 4096] - - [721, 8420.02] + - [827, 8420.02] - - [4096, 3995, 1, 1024] - - [699, 9693.87] + - [805, 9693.87] - - [4096, 3190, 1, 1024] - - [699, 9474.84] + - [805, 9474.84] - - [4096, 3594, 1, 1024] - - [700, 9315.83] + - [806, 9315.83] - - [159, 162, 400, 64] - - [732, 5429.98] + - [838, 5429.98] - - [1024, 3565, 1, 4096] - - [715, 8532.8] + - [821, 8532.8] - - [4096, 3422, 1, 1024] - - [700, 9459.24] + - [806, 9459.24] - - [1024, 3214, 1, 4096] - - [720, 8064.92] + - [826, 8064.92] - - [33708, 3584, 1, 1024] - - [701, 10129.0] + - [807, 10129.0] - - [33708, 3640, 1, 1024] - - [698, 9919.22] + - [804, 9919.22] - - [4096, 3263, 1, 1024] - - [698, 9699.35] + - [804, 9699.35] - - [4096, 3296, 1, 1024] - - [698, 9780.8] + - [804, 9780.8] - - [1024, 3557, 1, 4096] - - [719, 8526.89] + - [825, 8526.89] - - [4096, 3463, 1, 1024] - - [698, 9578.13] + - [804, 9578.13] - - [4096, 3528, 1, 1024] - - [698, 9739.92] + - [804, 9739.92] - - [14, 14, 4368, 64] - - [730, 991.276] + - [836, 991.276] - - [4096, 3226, 1, 1024] - - [698, 9587.19] + - [804, 9587.19] - - [4096, 3439, 1, 1024] - - [701, 9499.72] + - [807, 9499.72] - - [1024, 3523, 1, 4096] - - [721, 8393.58] + - [827, 8393.58] - - [1024, 3098, 1, 4096] - - [727, 7882.87] + - [833, 7882.87] - - [4096, 3121, 1, 1024] - - [698, 9296.23] + - [804, 9296.23] - - [33708, 3894, 1, 1024] - - [699, 9952.27] + - [805, 9952.27] - - [1024, 3548, 1, 4096] - - [705, 8432.45] + - [811, 8432.45] - - [1024, 3451, 1, 4096] - - [718, 8456.44] + - [824, 8456.44] - - [4096, 3353, 1, 1024] - - [700, 9289.08] + - [806, 9289.08] - - [4096, 3402, 1, 1024] - - [700, 9406.44] + - [806, 9406.44] - - [4096, 3939, 1, 1024] - - [698, 9549.59] + - [804, 9549.59] - - [133, 133, 480, 64] - - [745, 4124.31] + - [851, 4124.31] - - [1024, 3559, 1, 4096] - - [720, 8587.04] + - [826, 8587.04] - - [1024, 2977, 1, 4096] - - [703, 9084.59] + - [809, 9084.59] - - [1024, 3478, 1, 4096] - - [714, 8342.85] + - [820, 8342.85] - - [134, 134, 480, 64] - - [747, 4204.43] + - [853, 4204.43] - - [1024, 3368, 1, 4096] - - [720, 8277.43] + - [826, 8277.43] - - [4096, 4012, 1, 1024] - - [700, 9726.57] + - [806, 9726.57] - - [4096, 3486, 1, 1024] - - [698, 9639.71] + - [804, 9639.71] - - [1024, 3479, 1, 4096] - - [708, 8420.37] + - [814, 8420.37] - - [1024, 3505, 1, 4096] - - [720, 8310.66] + - [826, 8310.66] - - [4096, 3381, 1, 1024] - - [701, 9357.75] + - [807, 9357.75] - - [4096, 3430, 1, 1024] - - [698, 9482.36] + - [804, 9482.36] - - [1024, 3554, 1, 4096] - - [720, 8592.38] + - [826, 8592.38] - - [4096, 3271, 1, 1024] - - [698, 9715.41] + - [804, 9715.41] - - [1024, 3063, 1, 4096] - - [702, 9388.56] + - [808, 9388.56] - - [1024, 3209, 1, 4096] - - [720, 8212.74] + - [826, 8212.74] - - [4096, 3503, 1, 1024] - - [700, 9680.59] + - [806, 9680.59] - - [4096, 3344, 1, 1024] - - [698, 9268.55] + - [804, 9268.55] - - [1024, 3147, 1, 4096] - - [721, 8037.2] + - [827, 8037.2] - - [1024, 3322, 1, 4096] - - [719, 8356.32] + - [825, 8356.32] - - [1024, 3341, 1, 4096] - - [720, 8316.33] + - [826, 8316.33] - - [1024, 3516, 1, 4096] - - [702, 8397.12] + - [808, 8397.12] - - [102, 101, 624, 64] - - [733, 4709.59] + - [839, 4709.59] - - [1024, 3454, 1, 4096] - - [719, 8425.6] + - [825, 8425.6] - - [4096, 3969, 1, 1024] - - [700, 9640.15] + - [806, 9640.15] - - [4096, 3466, 1, 1024] - - [700, 9576.83] + - [806, 9576.83] - - [1024, 3999, 1, 1024] - - [703, 9207.15] + - [809, 9207.15] - - [1024, 4032, 1, 1024] - - [704, 9294.56] + - [810, 9294.56] - - [1024, 3403, 1, 4096] - - [718, 8357.97] + - [824, 8357.97] - - [4096, 3361, 1, 1024] - - [700, 9308.78] + - [806, 9308.78] - - [1024, 3527, 1, 4096] - - [719, 8512.19] + - [825, 8512.19] - - [1024, 3822, 1, 4096] - - [703, 8991.13] + - [809, 8991.13] - - [4096, 3315, 1, 1024] - - [698, 9834.96] + - [804, 9834.96] - - [232, 232, 272, 64] - - [732, 6481.62] + - [838, 6481.62] - - [1024, 3336, 1, 4096] - - [721, 8295.61] + - [827, 8295.61] - - [228, 232, 272, 64] - - [733, 6327.85] + - [839, 6327.85] - - [4096, 3547, 1, 1024] - - [698, 9781.56] + - [804, 9781.56] - - [4096, 3340, 1, 1024] - - [700, 9269.72] + - [806, 9269.72] - - [1024, 3906, 1, 1024] - - [704, 9018.38] + - [810, 9018.38] - - [1024, 3295, 1, 4096] - - [718, 8194.83] + - [824, 8194.83] - - [4096, 3294, 1, 1024] - - [701, 9762.16] + - [807, 9762.16] - - [33708, 3968, 1, 1024] - - [701, 10147.8] + - [807, 10147.8] - - [1024, 3473, 1, 4096] - - [707, 8318.68] + - [813, 8318.68] - - [1024, 3072, 1, 4096] - - [704, 9370.13] + - [810, 9370.13] - - [4096, 3189, 1, 1024] - - [698, 9470.26] + - [804, 9470.26] - - [4096, 3494, 1, 1024] - - [698, 9661.32] + - [804, 9661.32] - - [1024, 3522, 1, 4096] - - [721, 8459.23] + - [827, 8459.23] - - [33708, 3944, 1, 1024] - - [701, 10060.2] + - [807, 10060.2] - - [135, 135, 480, 64] - - [746, 4257.03] + - [852, 4257.03] - - [4096, 3421, 1, 1024] - - [698, 9456.98] + - [804, 9456.98] - - [32, 32, 1984, 64] - - [743, 3436.24] + - [849, 3436.24] - - [4096, 3311, 1, 1024] - - [698, 9810.88] + - [804, 9810.88] - - [1024, 3990, 1, 1024] - - [705, 9197.74] + - [811, 9197.74] - - [1024, 3290, 1, 4096] - - [718, 8229.63] + - [824, 8229.63] - - [4096, 3565, 1, 1024] - - [699, 9824.48] + - [805, 9824.48] - - [1024, 3484, 1, 4096] - - [708, 8575.38] + - [814, 8575.38] - - [4096, 3384, 1, 1024] - - [698, 9366.54] + - [804, 9366.54] - - [1024, 3422, 1, 4096] - - [718, 8484.12] + - [824, 8484.12] - - [4096, 3681, 1, 1024] - - [699, 9520.16] + - [805, 9520.16] - - [1024, 3584, 1, 1024] - - [725, 8583.37] + - [831, 8583.37] - - [4096, 4050, 1, 1024] - - [700, 9807.35] + - [806, 9807.35] - - [1024, 3996, 1, 4096] - - [701, 9181.7] + - [807, 9181.7] - - [4096, 3169, 1, 1024] - - [699, 9411.4] + - [805, 9411.4] - - [4096, 3538, 1, 1024] - - [699, 9765.99] + - [805, 9765.99] - - [1024, 3495, 1, 4096] - - [705, 8295.95] + - [811, 8295.95] - - [4096, 3401, 1, 1024] - - [698, 9402.68] + - [804, 9402.68] - - [1024, 3560, 1, 4096] - - [719, 8513.45] + - [825, 8513.45] - - [133, 135, 480, 64] - - [746, 4199.08] + - [852, 4199.08] - - [1024, 3263, 1, 4096] - - [720, 8172.23] + - [826, 8172.23] - - [1024, 3870, 1, 4096] - - [700, 8996.27] + - [806, 8996.27] - - [4096, 3555, 1, 1024] - - [701, 9811.88] + - [807, 9811.88] - - [4096, 3412, 1, 1024] - - [698, 9432.09] + - [804, 9432.09] - - [101, 101, 624, 64] - - [732, 4667.69] + - [838, 4667.69] - - [1024, 3296, 1, 4096] - - [719, 8350.61] + - [825, 8350.61] - - [1024, 3379, 1, 4096] - - [721, 8432.94] + - [827, 8432.94] - - [4096, 3302, 1, 1024] - - [698, 9796.39] + - [804, 9796.39] - - [1024, 3490, 1, 4096] - - [718, 8538.44] + - [824, 8538.44] - - [1024, 3428, 1, 4096] - - [719, 8531.67] + - [825, 8531.67] - - [1024, 3976, 1, 4096] - - [700, 9327.87] + - [806, 9327.87] - - [4096, 3485, 1, 1024] - - [698, 9628.82] + - [804, 9628.82] - - [4096, 3534, 1, 1024] - - [698, 9755.97] + - [804, 9755.97] - - [1024, 3064, 1, 4096] - - [704, 9196.98] + - [810, 9196.98] - - [4096, 3216, 1, 1024] - - [700, 9563.44] + - [806, 9563.44] - - [1024, 3450, 1, 4096] - - [728, 8519.29] + - [834, 8519.29] - - [1024, 3533, 1, 4096] - - [719, 8495.77] + - [825, 8495.77] - - [1024, 4030, 1, 1024] - - [704, 9304.68] + - [810, 9304.68] - - [1024, 3311, 1, 4096] - - [719, 8278.6] + - [825, 8278.6] - - [1024, 3468, 1, 4096] - - [710, 8564.55] + - [816, 8564.55] - - [23, 23, 2720, 64] - - [734, 2311.55] + - [840, 2311.55] - - [4096, 3359, 1, 1024] - - [700, 9309.15] + - [806, 9309.15] - - [4096, 3392, 1, 1024] - - [700, 9388.19] + - [806, 9388.19] - - [1024, 3925, 1, 1024] - - [702, 9006.72] + - [808, 9006.72] - - [4096, 3233, 1, 1024] - - [698, 9603.64] + - [804, 9603.64] - - [4096, 3956, 1, 1024] - - [699, 9581.94] + - [805, 9581.94] - - [1024, 3463, 1, 4096] - - [720, 8293.97] + - [826, 8293.97] - - [1024, 3126, 1, 4096] - - [719, 7978.13] + - [825, 7978.13] - - [1024, 3363, 1, 4096] - - [712, 8267.47] + - [818, 8267.47] - - [4096, 3465, 1, 1024] - - [698, 9590.74] + - [804, 9590.74] - - [33708, 3996, 1, 1024] - - [699, 9899.99] + - [805, 9899.99] - - [1024, 3231, 1, 4096] - - [720, 8231.68] + - [826, 8231.68] - - [33708, 3978, 1, 1024] - - [699, 9853.64] + - [805, 9853.64] - - [4096, 3476, 1, 1024] - - [698, 9616.62] + - [804, 9616.62] - - [85, 85, 752, 64] - - [730, 4240.65] + - [836, 4240.65] - - [4096, 3339, 1, 1024] - - [700, 9249.81] + - [806, 9249.81] - - [4096, 3452, 1, 1024] - - [698, 9534.13] + - [804, 9534.13] - - [1024, 3396, 1, 4096] - - [719, 8451.23] + - [825, 8451.23] - - [4096, 3293, 1, 1024] - - [700, 9775.22] + - [806, 9775.22] - - [54, 54, 1184, 64] - - [732, 4153.54] + - [838, 4153.54] - - [1024, 3432, 1, 4096] - - [713, 8345.53] + - [819, 8345.53] - - [4096, 3493, 1, 1024] - - [701, 9649.9] + - [807, 9649.9] - - [4096, 3350, 1, 1024] - - [700, 9273.91] + - [806, 9273.91] - - [1024, 3079, 1, 4096] - - [728, 7775.66] + - [834, 7775.66] - - [1024, 3101, 1, 4096] - - [728, 7847.85] + - [834, 7847.85] - - [33708, 3939, 1, 1024] - - [701, 10054.4] + - [807, 10054.4] - - [4096, 3256, 1, 1024] - - [700, 9681.83] + - [806, 9681.83] - - [1024, 3439, 1, 4096] - - [719, 8531.11] + - [825, 8531.11] - - [1024, 3510, 1, 4096] - - [718, 8422.31] + - [824, 8422.31] - - [4096, 3900, 1, 1024] - - [699, 9468.61] + - [805, 9468.61] - - [1024, 3470, 1, 4096] - - [720, 8507.77] + - [826, 8507.77] - - [4096, 3456, 1, 1024] - - [700, 9577.46] + - [806, 9577.46] - - [4096, 3014, 1, 1024] - - [699, 9666.15] + - [805, 9666.15] - - [4096, 3367, 1, 1024] - - [701, 9328.36] + - [807, 9328.36] - - [4096, 3432, 1, 1024] - - [698, 9480.88] + - [804, 9480.88] - - [33708, 4026, 1, 1024] - - [701, 9972.83] + - [807, 9972.83] - - [4096, 3273, 1, 1024] - - [698, 9716.95] + - [804, 9716.95] - - [4096, 3130, 1, 1024] - - [698, 9311.4] + - [804, 9311.4] - - [1024, 3496, 1, 4096] - - [709, 8434.65] + - [815, 8434.65] - - [1024, 3995, 1, 4096] - - [694, 9157.73] + - [800, 9157.73] - - [1024, 3939, 1, 4096] - - [702, 9059.86] + - [808, 9059.86] - - [1024, 3121, 1, 4096] - - [726, 7963.43] + - [832, 7963.43] - - [1024, 3232, 1, 4096] - - [720, 8061.09] + - [826, 8061.09] - - [4096, 3147, 1, 1024] - - [700, 9364.63] + - [806, 9364.63] - - [4096, 3516, 1, 1024] - - [698, 9708.84] + - [804, 9708.84] - - [1024, 3969, 1, 1024] - - [704, 9168.68] + - [810, 9168.68] - - [1024, 3364, 1, 4096] - - [708, 8363.65] + - [814, 8363.65] - - [4096, 3411, 1, 1024] - - [701, 9442.77] + - [807, 9442.77] - - [147, 147, 432, 64] - - [745, 4843.21] + - [851, 4843.21] - - [4096, 3301, 1, 1024] - - [700, 9783.46] + - [806, 9783.46] - - [112, 111, 576, 64] - - [732, 5627.47] + - [838, 5627.47] - - [1024, 3513, 1, 4096] - - [719, 8725.41] + - [825, 8725.41] - - [1024, 3469, 1, 4096] - - [699, 8183.11] + - [805, 8183.11] - - [1024, 3095, 1, 4096] - - [720, 7887.87] + - [826, 7887.87] - - [4096, 3533, 1, 1024] - - [699, 9755.27] + - [805, 9755.27] - - [4096, 3390, 1, 1024] - - [698, 9377.21] + - [804, 9377.21] - - [4096, 3582, 1, 1024] - - [698, 9874.96] + - [804, 9874.96] - - [1024, 3956, 1, 1024] - - [704, 9058.82] + - [810, 9058.82] - - [4096, 3585, 1, 1024] - - [700, 9289.75] + - [806, 9289.75] - - [4096, 3231, 1, 1024] - - [699, 9597.15] + - [805, 9597.15] - - [1024, 3205, 1, 4096] - - [718, 8073.25] + - [824, 8073.25] - - [4096, 3496, 1, 1024] - - [699, 9668.38] + - [805, 9668.38] - - [1024, 3143, 1, 4096] - - [718, 8031.68] + - [824, 8031.68] - - [1024, 3318, 1, 4096] - - [715, 8261.43] + - [821, 8261.43] - - [1024, 3353, 1, 4096] - - [719, 8414.92] + - [825, 8414.92] - - [1024, 3464, 1, 4096] - - [718, 8310.03] + - [824, 8310.03] - - [4096, 2736, 1, 1024] - - [700, 9563.12] + - [806, 9563.12] - - [1024, 3402, 1, 4096] - - [715, 8413.84] + - [821, 8413.84] - - [4096, 3138, 1, 1024] - - [700, 9342.09] + - [806, 9342.09] - - [1024, 3860, 1, 4096] - - [703, 9008.57] + - [809, 9008.57] - - [148, 148, 432, 64] - - [745, 4915.7] + - [851, 4915.7] - - [1024, 3539, 1, 4096] - - [715, 8449.36] + - [821, 8449.36] - - [4096, 3211, 1, 1024] - - [700, 9551.28] + - [806, 9551.28] - - [1024, 3332, 1, 4096] - - [708, 8295.11] + - [814, 8295.11] - - [1024, 3466, 1, 4096] - - [719, 8339.25] + - [825, 8339.25] - - [4096, 3475, 1, 1024] - - [698, 9612.33] + - [804, 9612.33] - - [4096, 3524, 1, 1024] - - [701, 9722.74] + - [807, 9722.74] - - [4096, 2985, 1, 1024] - - [701, 9591.33] + - [807, 9591.33] - - [4096, 3222, 1, 1024] - - [698, 9577.48] + - [804, 9577.48] - - [4096, 3451, 1, 1024] - - [700, 9541.42] + - [806, 9541.42] - - [1024, 3181, 1, 4096] - - [718, 8118.89] + - [824, 8118.89] - - [1024, 3640, 1, 4096] - - [703, 8617.11] + - [809, 8617.11] - - [1024, 3375, 1, 4096] - - [707, 8419.75] + - [813, 8419.75] - - [1024, 3550, 1, 4096] - - [720, 8512.83] + - [826, 8512.83] - - [1024, 4020, 1, 1024] - - [704, 9266.9] + - [810, 9266.9] - - [1024, 3840, 1, 4096] - - [703, 8983.49] + - [809, 8983.49] - - [4096, 3349, 1, 1024] - - [698, 9279.96] + - [804, 9279.96] - - [4096, 3398, 1, 1024] - - [699, 9402.32] + - [805, 9402.32] - - [33708, 3976, 1, 1024] - - [700, 9849.54] + - [806, 9849.54] - - [1024, 2917, 1, 4096] - - [705, 8936.87] + - [811, 8936.87] - - [33708, 3910, 1, 1024] - - [698, 9983.35] + - [804, 9983.35] - - [4096, 3860, 1, 1024] - - [699, 9377.58] + - [805, 9377.58] - - [4096, 3304, 1, 1024] - - [701, 9798.44] + - [807, 9798.44] - - [1024, 3286, 1, 4096] - - [706, 8167.41] + - [812, 8167.41] - - [1024, 3460, 1, 4096] - - [716, 8539.56] + - [822, 8539.56] - - [1024, 4026, 1, 4096] - - [702, 9305.68] + - [808, 9305.68] - - [4096, 3471, 1, 1024] - - [700, 9596.71] + - [806, 9596.71] - - [193, 193, 320, 64] - - [748, 4758.46] + - [854, 4758.46] - - [1024, 3894, 1, 1024] - - [702, 8979.6] + - [808, 8979.6] - - [65, 65, 992, 64] - - [744, 2565.49] + - [850, 2565.49] - - [1024, 3506, 1, 4096] - - [716, 8593.22] + - [822, 8593.22] - - [35, 35, 1808, 64] - - [738, 2129.72] + - [844, 2129.72] - - [1024, 4000, 1, 1024] - - [702, 9204.6] + - [808, 9204.6] - - [1024, 3900, 1, 4096] - - [698, 9050.36] + - [804, 9050.36] - - [1024, 3445, 1, 4096] - - [721, 8551.65] + - [827, 8551.65] - - [4096, 3442, 1, 1024] - - [699, 9505.0] + - [805, 9505.0] - - [1024, 3358, 1, 4096] - - [720, 8437.16] + - [826, 8437.16] - - [13, 13, 4672, 64] - - [731, 860.665] + - [837, 860.665] - - [1024, 3211, 1, 4096] - - [724, 8085.25] + - [830, 8085.25] - - [4096, 3515, 1, 1024] - - [700, 9715.29] + - [806, 9715.29] - - [1024, 3564, 1, 4096] - - [706, 8760.37] + - [812, 8760.37] - - [4096, 3057, 1, 1024] - - [700, 9804.05] + - [806, 9804.05] - - [1024, 3343, 1, 4096] - - [718, 8363.8] + - [824, 8363.8] - - [4096, 3262, 1, 1024] - - [699, 9686.49] + - [805, 9686.49] - - [1024, 3518, 1, 4096] - - [718, 8455.05] + - [824, 8455.05] - - [77, 77, 816, 64] - - [737, 3505.94] + - [843, 3505.94] - - [33708, 3876, 1, 1024] - - [699, 9895.95] + - [805, 9895.95] - - [4096, 3462, 1, 1024] - - [700, 9570.31] + - [806, 9570.31] - - [1024, 3265, 1, 4096] - - [718, 8322.75] + - [824, 8322.75] - - [4096, 3389, 1, 1024] - - [699, 9382.86] + - [805, 9382.86] - - [4096, 3438, 1, 1024] - - [700, 9503.47] + - [806, 9503.47] - - [1024, 3955, 1, 1024] - - [702, 9064.45] + - [808, 9064.45] - - [1024, 3545, 1, 4096] - - [721, 8652.41] + - [827, 8652.41] - - [1024, 3144, 1, 4096] - - [721, 8060.55] + - [827, 8060.55] - - [1024, 3417, 1, 4096] - - [719, 8505.91] + - [825, 8505.91] - - [4096, 3543, 1, 1024] - - [698, 9775.67] + - [804, 9775.67] - - [4096, 3352, 1, 1024] - - [700, 9282.87] + - [806, 9282.87] - - [33708, 3975, 1, 1024] - - [701, 9849.49] + - [807, 9849.49] - - [148, 147, 432, 64] - - [745, 4876.15] + - [851, 4876.15] - - [4096, 3137, 1, 1024] - - [698, 9330.63] + - [804, 9330.63] - - [4096, 3506, 1, 1024] - - [701, 9682.76] + - [807, 9682.76] - - [1024, 3975, 1, 1024] - - [704, 9164.77] + - [810, 9164.77] - - [1024, 3859, 1, 4096] - - [702, 8983.84] + - [808, 8983.84] - - [4096, 3369, 1, 1024] - - [700, 9330.45] + - [806, 9330.45] - - [1024, 3434, 1, 4096] - - [718, 8486.98] + - [824, 8486.98] - - [1024, 3292, 1, 4096] - - [718, 8478.96] + - [824, 8478.96] - - [4096, 3523, 1, 1024] - - [698, 9734.83] + - [804, 9734.83] - - [4096, 3380, 1, 1024] - - [700, 9354.49] + - [806, 9354.49] - - [1024, 3408, 1, 4096] - - [721, 8441.03] + - [827, 8441.03] - - [4096, 3221, 1, 1024] - - [700, 9575.59] + - [806, 9575.59] - - [4096, 3270, 1, 1024] - - [700, 9717.95] + - [806, 9717.95] - - [143, 143, 432, 64] - - [746, 4643.45] + - [852, 4643.45] - - [111, 111, 576, 64] - - [738, 5475.04] + - [844, 5475.04] - - [1024, 3303, 1, 4096] - - [720, 8413.07] + - [826, 8413.07] - - [4096, 3502, 1, 1024] - - [700, 9679.87] + - [806, 9679.87] - - [1024, 3222, 1, 4096] - - [720, 8141.88] + - [826, 8141.88] - - [4096, 2505, 1, 1024] - - [698, 9594.95] + - [804, 9594.95] - - [4096, 3397, 1, 1024] - - [698, 9392.61] + - [804, 9392.61] - - [4096, 3562, 1, 1024] - - [698, 9827.58] + - [804, 9827.58] - - [4096, 3095, 1, 1024] - - [700, 9222.45] + - [806, 9222.45] - - [1024, 3226, 1, 4096] - - [716, 8027.03] + - [822, 8027.03] - - [177, 177, 352, 64] - - [733, 6406.96] + - [839, 6406.96] - - [4096, 3360, 1, 1024] - - [699, 9298.15] + - [805, 9298.15] - - [1024, 3942, 1, 1024] - - [704, 9061.59] + - [810, 9061.59] - - [1024, 3298, 1, 4096] - - [721, 8254.36] + - [827, 8254.36] - - [1024, 3381, 1, 4096] - - [720, 8508.81] + - [826, 8508.81] - - [4096, 3314, 1, 1024] - - [700, 9837.56] + - [806, 9837.56] - - [1024, 3492, 1, 4096] - - [708, 8583.39] + - [814, 8583.39] - - [1024, 3430, 1, 4096] - - [708, 8492.71] + - [814, 8492.71] - - [4096, 3977, 1, 1024] - - [700, 9656.45] + - [806, 9656.45] - - [4096, 3546, 1, 1024] - - [698, 9780.35] + - [804, 9780.35] - - [4096, 3640, 1, 1024] - - [698, 9415.51] + - [804, 9415.51] - - [4096, 3441, 1, 1024] - - [699, 9499.24] + - [805, 9499.24] - - [33708, 4059, 1, 1024] - - [701, 10051.9] + - [807, 10051.9] - - [1024, 3978, 1, 1024] - - [702, 9158.8] + - [808, 9158.8] - - [1024, 3376, 1, 4096] - - [720, 8415.44] + - [826, 8415.44] - - [1024, 3482, 1, 4096] - - [721, 8396.62] + - [827, 8396.62] - - [1024, 3563, 1, 4096] - - [704, 8424.18] + - [810, 8424.18] - - [4096, 4020, 1, 1024] - - [701, 9745.96] + - [807, 9745.96] - - [1024, 3271, 1, 4096] - - [719, 8289.68] + - [825, 8289.68] - - [1024, 3291, 1, 4096] - - [719, 8222.71] + - [825, 8222.71] - - [1024, 3431, 1, 4096] - - [714, 8464.4] + - [820, 8464.4] - - [1024, 3481, 1, 4096] - - [720, 8386.5] + - [826, 8386.5] - - [84, 85, 752, 64] - - [735, 4194.85] + - [841, 4194.85] - - [4096, 3461, 1, 1024] - - [698, 9579.67] + - [804, 9579.67] - - [1024, 3574, 1, 4096] - - [721, 8579.8] + - [827, 8579.8] - - [1024, 4059, 1, 1024] - - [702, 9330.54] + - [808, 9330.54] - - [84, 84, 752, 64] - - [742, 4141.46] + - [848, 4141.46] - - [1024, 3421, 1, 4096] - - [721, 8528.42] + - [827, 8528.42] - - [4096, 3224, 1, 1024] - - [700, 9589.95] + - [806, 9589.95] - - [4096, 3437, 1, 1024] - - [700, 9498.2] + - [806, 9498.2] - - [45, 45, 1424, 64] - - [732, 3314.58] + - [838, 3314.58] - - [4096, 3840, 1, 1024] - - [698, 9931.37] + - [804, 9931.37] - - [4096, 3168, 1, 1024] - - [700, 9412.16] + - [806, 9412.16] - - [33708, 3990, 1, 1024] - - [698, 9884.39] + - [804, 9884.39] - - [1024, 3349, 1, 4096] - - [720, 8421.4] + - [826, 8421.4] - - [4096, 3335, 1, 1024] - - [698, 9241.65] + - [804, 9241.65] - - [4096, 3400, 1, 1024] - - [700, 9407.35] + - [806, 9407.35] - - [160, 159, 400, 64] - - [747, 5708.94] + - [853, 5708.94] - - [1024, 3398, 1, 4096] - - [720, 8624.03] + - [826, 8624.03] - - [1024, 3780, 1, 4096] - - [700, 8756.78] + - [806, 8756.78] - - [29, 29, 2176, 64] - - [743, 2963.69] + - [849, 2963.69] - - [4096, 3098, 1, 1024] - - [698, 9229.82] + - [804, 9229.82] - - [1024, 4012, 1, 4096] - - [704, 9422.03] + - [810, 9422.03] - - [4096, 3505, 1, 1024] - - [700, 9687.65] + - [806, 9687.65] - - [4096, 3554, 1, 1024] - - [700, 9812.22] + - [806, 9812.22] - - [4096, 3063, 1, 1024] - - [700, 9825.1] + - [806, 9825.1] - - [1024, 3503, 1, 4096] - - [718, 8404.74] + - [824, 8404.74] - - [1024, 3166, 1, 4096] - - [721, 8084.93] + - [827, 8084.93] - - [1024, 3425, 1, 4096] - - [721, 8537.58] + - [827, 8537.58] - - [1024, 3344, 1, 4096] - - [712, 8351.16] + - [818, 8351.16] - - [4096, 3484, 1, 1024] - - [700, 9635.7] + - [806, 9635.7] - - [1024, 3681, 1, 1024] - - [703, 8457.18] + - [809, 8457.18] - - [1024, 4050, 1, 1024] - - [704, 9326.21] + - [810, 9326.21] - - [4096, 3379, 1, 1024] - - [698, 9356.16] + - [804, 9356.16] - - [4096, 3428, 1, 1024] - - [699, 9472.33] + - [805, 9472.33] - - [12, 12, 5040, 64] - - [737, 741.617] + - [843, 741.617] - - [27, 27, 2336, 64] - - [743, 2757.9] + - [849, 2757.9] - - [1024, 3304, 1, 4096] - - [721, 8317.82] + - [827, 8317.82] - - [1024, 3387, 1, 4096] - - [719, 8460.15] + - [825, 8460.15] - - [4096, 3126, 1, 1024] - - [701, 9308.48] + - [807, 9308.48] - - [1024, 3498, 1, 4096] - - [718, 8485.55] + - [824, 8485.55] - - [1024, 3436, 1, 4096] - - [720, 8397.71] + - [826, 8397.71] - - [4096, 3501, 1, 1024] - - [698, 9681.19] + - [804, 9681.19] - - [4096, 3358, 1, 1024] - - [700, 9304.9] + - [806, 9304.9] - - [4096, 3232, 1, 1024] - - [698, 9607.2] + - [804, 9607.2] - - [1024, 3585, 1, 4096] - - [702, 8510.74] + - [808, 8510.74] - - [4096, 3143, 1, 1024] - - [701, 9355.91] + - [807, 9355.91] - - [4096, 3464, 1, 1024] - - [700, 9585.95] + - [806, 9585.95] - - [1024, 3366, 1, 4096] - - [708, 8275.23] + - [814, 8275.23] - - [4096, 3375, 1, 1024] - - [698, 9342.13] + - [804, 9342.13] - - [4096, 2917, 1, 1024] - - [698, 9372.84] + - [804, 9372.84] - - [4096, 4026, 1, 1024] - - [700, 9759.15] + - [806, 9759.15] - - [49, 49, 1296, 64] - - [739, 3710.02] + - [845, 3710.02] - - [1024, 3277, 1, 4096] - - [719, 8217.1] + - [825, 8217.1] - - [1024, 3103, 1, 4096] - - [720, 7872.67] + - [826, 7872.67] - - [33708, 3995, 1, 1024] - - [700, 9893.08] + - [806, 9893.08] - - [1024, 3297, 1, 4096] - - [719, 8185.82] + - [825, 8185.82] - - [4096, 3545, 1, 1024] - - [700, 9789.43] + - [806, 9789.43] - - [1024, 3399, 1, 4096] - - [719, 8377.18] + - [825, 8377.18] - - [33708, 3796, 1, 1024] - - [699, 10008.0] + - [805, 10008.0] - - [4096, 3292, 1, 1024] - - [700, 9767.28] + - [806, 9767.28] - - [71, 71, 896, 64] - - [734, 3006.25] + - [840, 3006.25] - - [33708, 3859, 1, 1024] - - [701, 9860.37] + - [807, 9860.37] - - [4096, 3566, 1, 1024] - - [700, 9834.47] + - [806, 9834.47] - - [4096, 3894, 1, 1024] - - [698, 9456.67] + - [804, 9456.67] - - [4096, 3492, 1, 1024] - - [698, 9653.24] + - [804, 9653.24] - - [1024, 3977, 1, 1024] - - [704, 9161.33] + - [810, 9161.33] - - [1024, 3272, 1, 4096] - - [721, 8257.09] + - [827, 8257.09] - - [135, 134, 480, 64] - - [745, 4238.39] + - [851, 4238.39] - - [1024, 3355, 1, 4096] - - [719, 8374.64] + - [825, 8374.64] - - [4096, 3419, 1, 1024] - - [701, 9455.44] + - [807, 9455.44] - - [1024, 3404, 1, 4096] - - [720, 8580.28] + - [826, 8580.28] - - [4096, 3999, 1, 1024] - - [700, 9701.78] + - [806, 9701.78] - - [4096, 3166, 1, 1024] - - [698, 9410.48] + - [804, 9410.48] - - [33708, 3840, 1, 1024] - - [701, 10132.9] + - [807, 10132.9] - - [4096, 4032, 1, 1024] - - [701, 9762.86] + - [807, 9762.86] - - [1024, 3573, 1, 4096] - - [719, 8603.4] + - [825, 8603.4] - - [4096, 3366, 1, 1024] - - [701, 9322.63] + - [807, 9322.63] - - [1024, 3541, 1, 4096] - - [721, 8405.9] + - [827, 8405.9] - - [4096, 3207, 1, 1024] - - [698, 9544.25] + - [804, 9544.25] - - [4096, 3272, 1, 1024] - - [700, 9716.73] + - [806, 9716.73] - - [1024, 3334, 1, 4096] - - [718, 8241.39] + - [824, 8241.39] - - [228, 228, 272, 64] - - [733, 6232.45] + - [839, 6232.45] - - [4096, 3183, 1, 1024] - - [700, 9452.44] + - [806, 9452.44] - - [4096, 3536, 1, 1024] - - [699, 9759.44] + - [805, 9759.44] - - [1024, 4005, 1, 1024] - - [703, 9225.83] + - [809, 9225.83] - - [1024, 3245, 1, 4096] - - [720, 8074.31] + - [826, 8074.31] - - [4096, 3447, 1, 1024] - - [699, 9525.84] + - [805, 9525.84] - - [1024, 3183, 1, 4096] - - [719, 8121.62] + - [825, 8121.62] - - [1024, 3361, 1, 4096] - - [721, 8285.86] + - [827, 8285.86] - - [33708, 3870, 1, 1024] - - [699, 9879.35] + - [805, 9879.35] - - [1024, 3321, 1, 4096] - - [720, 8408.67] + - [826, 8408.67] - - [1024, 3968, 1, 1024] - - [702, 9202.05] + - [808, 9202.05] - - [1024, 3486, 1, 4096] - - [716, 8258.89] + - [822, 8258.89] - - [4096, 4005, 1, 1024] - - [700, 9723.98] + - [806, 9723.98] - - [4096, 3410, 1, 1024] - - [701, 9440.5] + - [807, 9440.5] - - [1024, 3944, 1, 1024] - - [704, 9040.82] + - [810, 9040.82] - - [4096, 3300, 1, 1024] - - [699, 9789.9] + - [805, 9789.9] - - [4096, 3579, 1, 1024] - - [701, 9859.44] + - [807, 9859.44] - - [4096, 3483, 1, 1024] - - [701, 9624.31] + - [807, 9624.31] - - [4096, 3532, 1, 1024] - - [700, 9742.76] + - [806, 9742.76] - - [1024, 3140, 1, 4096] - - [720, 7899.65] + - [826, 7899.65] - - [1024, 3372, 1, 4096] - - [718, 8237.07] + - [824, 8237.07] - - [1024, 3224, 1, 4096] - - [721, 8159.13] + - [827, 8159.13] - - [4096, 3230, 1, 1024] - - [700, 9601.25] + - [806, 9601.25] - - [4096, 3427, 1, 1024] - - [700, 9466.57] + - [806, 9466.57] - - [1024, 3796, 1, 1024] - - [704, 8739.78] + - [810, 8739.78] - - [143, 148, 432, 64] - - [745, 4762.0] + - [851, 4762.0] - - [1024, 3616, 1, 4096] - - [703, 8445.89] + - [809, 8445.89] - - [1024, 3315, 1, 4096] - - [720, 8403.21] + - [826, 8403.21] - - [1024, 3476, 1, 4096] - - [718, 8523.68] + - [824, 8523.68] - - [1024, 3509, 1, 4096] - - [718, 8345.05] + - [824, 8345.05] - - [4096, 3357, 1, 1024] - - [700, 9300.16] + - [806, 9300.16] - - [4096, 3406, 1, 1024] - - [700, 9427.44] + - [806, 9427.44] - - [1024, 3558, 1, 4096] - - [719, 8525.78] + - [825, 8525.78] - - [4096, 3593, 1, 1024] - - [700, 9302.2] + - [806, 9302.2] - - [4096, 3247, 1, 1024] - - [700, 9648.5] + - [806, 9648.5] - - [4096, 3088, 1, 1024] - - [700, 9204.21] + - [806, 9204.21] - - [1024, 3213, 1, 4096] - - [718, 8054.31] + - [824, 8054.31] - - [4096, 3511, 1, 1024] - - [698, 9702.7] + - [804, 9702.7] - - [122, 122, 528, 64] - - [739, 6293.39] + - [845, 6293.39] - - [1024, 3365, 1, 4096] - - [715, 8413.62] + - [821, 8413.62] - - [1024, 3504, 1, 4096] - - [717, 8414.46] + - [823, 8414.46] - - [1024, 3442, 1, 4096] - - [720, 8684.0] + - [826, 8684.0] - - [4096, 3474, 1, 1024] - - [698, 9611.6] + - [804, 9611.6] - - [4096, 2984, 1, 1024] - - [699, 9592.82] + - [805, 9592.82] - - [1024, 3876, 1, 4096] - - [702, 9085.95] + - [808, 9085.95] - - [4096, 3337, 1, 1024] - - [700, 9246.22] + - [806, 9246.22] - - [4096, 3450, 1, 1024] - - [700, 9534.63] + - [806, 9534.63] - - [1024, 3547, 1, 4096] - - [720, 8386.73] + - [826, 8386.73] - - [4096, 3291, 1, 1024] - - [699, 9759.34] + - [805, 9759.34] - - [1024, 3340, 1, 4096] - - [719, 8237.97] + - [825, 8237.97] - - [4096, 3491, 1, 1024] - - [700, 9656.59] + - [806, 9656.59] - - [4096, 3348, 1, 1024] - - [700, 9279.15] + - [806, 9279.15] - - [78, 78, 816, 64] - - [740, 3591.09] + - [846, 3591.09] - - [4096, 3968, 1, 1024] - - [701, 9642.19] + - [807, 9642.19] - - [4096, 3906, 1, 1024] - - [701, 9485.37] + - [807, 9485.37] - - [1024, 3477, 1, 4096] - - [708, 8389.2] + - [814, 8389.2] - - [1024, 3397, 1, 4096] - - [718, 8556.88] + - [824, 8556.88] - - [4096, 3165, 1, 1024] - - [699, 9415.52] + - [805, 9415.52] - - [4096, 3470, 1, 1024] - - [698, 9598.5] + - [804, 9598.5] - - [1024, 3526, 1, 4096] - - [718, 8442.15] + - [824, 8442.15] - - [112, 112, 576, 64] - - [733, 5672.6] + - [839, 5672.6] - - [4096, 3365, 1, 1024] - - [698, 9321.83] + - [804, 9321.83] - - [4096, 3319, 1, 1024] - - [698, 9838.48] + - [804, 9838.48] - - [1024, 3401, 1, 4096] - - [720, 8460.86] + - [826, 8460.86] - - [1024, 3294, 1, 4096] - - [719, 8324.63] + - [825, 8324.63] - - [159, 159, 400, 64] - - [735, 5488.51] + - [841, 5488.51] - - [1024, 3472, 1, 4096] - - [713, 8289.77] + - [819, 8289.77] - - [4096, 3328, 1, 1024] - - [699, 9904.35] + - [805, 9904.35] - - [1024, 3861, 1, 1024] - - [704, 8917.63] + - [810, 8917.63] - - [1024, 3910, 1, 1024] - - [702, 9010.16] + - [808, 9010.16] - - [1024, 3410, 1, 4096] - - [720, 8519.63] + - [826, 8519.63] - - [1024, 3395, 1, 4096] - - [718, 8424.35] + - [824, 8424.35] - - [4096, 3282, 1, 1024] - - [698, 9743.67] + - [804, 9743.67] - - [1024, 3751, 1, 1024] - - [705, 8680.39] + - [811, 8680.39] - - [4096, 3145, 1, 1024] - - [700, 9353.37] + - [806, 9353.37] - - [4096, 3514, 1, 1024] - - [700, 9713.04] + - [806, 9713.04] - - [4096, 3944, 1, 1024] - - [700, 9563.92] + - [806, 9563.92] - - [1024, 3515, 1, 4096] - - [719, 8428.13] + - [825, 8428.13] - - [4096, 3409, 1, 1024] - - [699, 9428.77] + - [805, 9428.77] - - [4096, 3564, 1, 1024] - - [698, 9823.79] + - [804, 9823.79] - - [4096, 3299, 1, 1024] - - [700, 9793.03] + - [806, 9793.03] - - [1024, 3057, 1, 4096] - - [696, 9237.85] + - [802, 9237.85] - - [4096, 3531, 1, 1024] - - [698, 9745.64] + - [804, 9745.64] - - [4096, 3388, 1, 1024] - - [700, 9374.65] + - [806, 9374.65] - - [1024, 3189, 1, 4096] - - [720, 8084.6] + - [826, 8084.6] - - [1024, 3300, 1, 4096] - - [720, 8185.13] + - [826, 8185.13] - - [1024, 3720, 1, 4096] - - [699, 8755.11] + - [805, 8755.11] - - [1024, 3383, 1, 4096] - - [713, 8463.47] + - [819, 8463.47] - - [1024, 3494, 1, 4096] - - [720, 8676.57] + - [826, 8676.57] - - [77, 78, 816, 64] - - [736, 3548.26] + - [842, 3548.26] - - [1024, 3448, 1, 4096] - - [718, 8665.78] + - [824, 8665.78] - - [4096, 3542, 1, 1024] - - [698, 9771.88] + - [804, 9771.88] - - [1024, 3488, 1, 4096] - - [718, 8488.39] + - [824, 8488.39] - - [4096, 3405, 1, 1024] - - [700, 9426.16] + - [806, 9426.16] - - [1024, 3262, 1, 4096] - - [720, 8206.97] + - [826, 8206.97] - - [33708, 4005, 1, 1024] - - [701, 9928.16] + - [807, 9928.16] - - [1024, 3594, 1, 4096] - - [705, 8458.57] + - [811, 8458.57] - - [4096, 3103, 1, 1024] - - [701, 9243.14] + - [807, 9243.14] - - [4096, 3136, 1, 1024] - - [700, 9340.9] + - [806, 9340.9] - - [1024, 3378, 1, 4096] - - [721, 8432.45] + - [827, 8432.45] - - [10, 10, 5952, 64] - - [741, 523.353] + - [847, 523.353] - - [7, 7, 8192, 64] - - [741, 260.543] + - [847, 260.543] - - [4096, 3559, 1, 1024] - - [700, 9813.1] + - [806, 9813.1] - - [4096, 3368, 1, 1024] - - [701, 9328.66] + - [807, 9328.66] - - [4096, 3209, 1, 1024] - - [698, 9538.83] + - [804, 9538.83] - - [4096, 3322, 1, 1024] - - [700, 9839.58] + - [806, 9839.58] - - [1024, 3483, 1, 4096] - - [706, 8348.35] + - [812, 8348.35] - - [4096, 3473, 1, 1024] - - [699, 9605.79] + - [805, 9605.79] - - [4096, 3522, 1, 1024] - - [701, 9730.02] + - [807, 9730.02] - - [1024, 3532, 1, 4096] - - [719, 8474.32] + - [825, 8474.32] - - [4096, 3449, 1, 1024] - - [700, 9528.35] + - [806, 9528.35] - - [1024, 3351, 1, 4096] - - [721, 8311.23] + - [827, 8311.23] - - [1024, 3462, 1, 4096] - - [718, 8297.64] + - [824, 8297.64] - - [4096, 3396, 1, 1024] - - [700, 9400.25] + - [806, 9400.25] - - [132, 132, 480, 64] - - [746, 4089.84] + - [852, 4089.84] - - [111, 112, 576, 64] - - [732, 5529.7] + - [838, 5529.7] - - [1024, 3416, 1, 4096] - - [719, 8556.64] + - [825, 8556.64] - - [4096, 3469, 1, 1024] - - [701, 9598.77] + - [807, 9598.77] - - [1024, 3582, 1, 4096] - - [702, 8461.47] + - [808, 8461.47] - - [1024, 3230, 1, 4096] - - [719, 8188.94] + - [825, 8188.94] - - [1024, 3489, 1, 4096] - - [720, 8457.85] + - [826, 8457.85] - - [1024, 3427, 1, 4096] - - [720, 8566.59] + - [826, 8566.59] - - [1024, 3346, 1, 4096] - - [719, 8352.17] + - [825, 8352.17] - - [33708, 3977, 1, 1024] - - [701, 9868.5] + - [807, 9868.5] - - [4096, 3796, 1, 1024] - - [700, 9797.76] + - [806, 9797.76] - - [4096, 3176, 1, 1024] - - [700, 9435.39] + - [806, 9435.39] - - [4096, 3990, 1, 1024] - - [698, 9672.33] + - [804, 9672.33] - - [1024, 3257, 1, 4096] - - [721, 8225.17] + - [827, 8225.17] - - [4096, 3343, 1, 1024] - - [722, 9273.62] + - [828, 9273.62] - - [4096, 3440, 1, 1024] - - [698, 9501.48] + - [804, 9501.48] - - [33708, 4030, 1, 1024] - - [699, 9983.36] + - [805, 9983.36] - - [1024, 3190, 1, 4096] - - [720, 8192.11] + - [826, 8192.11] - - [1024, 3389, 1, 4096] - - [721, 8439.42] + - [827, 8439.42] - - [1024, 3500, 1, 4096] - - [719, 8556.12] + - [825, 8556.12] - - [1024, 3471, 1, 4096] - - [708, 8491.17] + - [814, 8491.17] - - [1024, 3438, 1, 4096] - - [721, 8567.95] + - [827, 8567.95] - - [4096, 3513, 1, 1024] - - [698, 9710.27] + - [804, 9710.27] - - [1024, 3562, 1, 4096] - - [713, 8608.94] + - [819, 8608.94] - - [4096, 3616, 1, 1024] - - [700, 9357.59] + - [806, 9357.59] - - [4096, 3955, 1, 1024] - - [699, 9589.71] + - [805, 9589.71] - - [1024, 3441, 1, 4096] - - [709, 8359.27] + - [815, 8359.27] - - [1024, 3236, 1, 4096] - - [723, 8022.6] + - [829, 8022.6] - - [1024, 3524, 1, 4096] - - [718, 8477.24] + - [824, 8477.24] - - [4096, 3460, 1, 1024] - - [698, 9581.96] + - [804, 9581.96] - - [16, 16, 3840, 64] - - [730, 1270.59] + - [836, 1270.59] - - [92, 93, 688, 64] - - [734, 4962.4] + - [840, 4962.4] - - [1024, 3384, 1, 4096] - - [709, 8409.39] + - [815, 8409.39] - - [4096, 3387, 1, 1024] - - [700, 9379.8] + - [806, 9379.8] - - [4096, 3436, 1, 1024] - - [698, 9491.93] + - [804, 9491.93] - - [4096, 3277, 1, 1024] - - [698, 9717.27] + - [804, 9717.27] - - [1024, 3457, 1, 4096] - - [718, 8279.22] + - [824, 8279.22] - - [1024, 3999, 1, 4096] - - [693, 9231.47] + - [799, 9231.47] - - [1024, 4032, 1, 4096] - - [702, 9443.62] + - [808, 9443.62] - - [4096, 3541, 1, 1024] - - [698, 9773.24] + - [804, 9773.24] - - [4096, 3334, 1, 1024] - - [698, 9242.79] + - [804, 9242.79] - - [1024, 3393, 1, 4096] - - [720, 8376.17] + - [826, 8376.17] - - [17, 17, 3632, 64] - - [742, 1425.77] + - [848, 1425.77] - - [1024, 3411, 1, 4096] - - [708, 8490.97] + - [814, 8490.97] - - [1024, 3822, 1, 1024] - - [705, 8773.44] + - [811, 8773.44] - - [1024, 3593, 1, 4096] - - [705, 8571.25] + - [811, 8571.25] - - [33708, 3822, 1, 1024] - - [699, 10056.8] + - [805, 10056.8] - - [4096, 3504, 1, 1024] - - [701, 9680.29] + - [807, 9680.29] - - [1024, 3163, 1, 4096] - - [720, 8014.43] + - [826, 8014.43] - - [1024, 3357, 1, 4096] - - [721, 8376.04] + - [827, 8376.04] - - [1024, 3906, 1, 4096] - - [702, 9108.22] + - [808, 9108.22] - - [4096, 3415, 1, 1024] - - [698, 9443.87] + - [804, 9443.87] - - [1024, 3406, 1, 4096] - - [721, 8451.64] + - [827, 8451.64] - - [4096, 3321, 1, 1024] - - [700, 9836.62] + - [806, 9836.62] - - [4096, 3584, 1, 1024] - - [701, 9915.93] + - [807, 9915.93] - - [1024, 2736, 1, 4096] - - [704, 8532.93] + - [810, 8532.93] - - [1024, 3110, 1, 4096] - - [721, 7889.29] + - [827, 7889.29] - - [33708, 3999, 1, 1024] - - [701, 9903.33] + - [807, 9903.33] - - [1024, 3093, 1, 4096] - - [719, 7919.35] + - [825, 7919.35] - - [4096, 3378, 1, 1024] - - [701, 9362.3] + - [807, 9362.3] - - [1024, 3543, 1, 4096] - - [715, 8438.16] + - [821, 8438.16] - - [33708, 3925, 1, 1024] - - [700, 10021.6] + - [806, 10021.6] - - [1024, 3352, 1, 4096] - - [721, 8333.82] + - [827, 8333.82] - - [4096, 3780, 1, 1024] - - [698, 9755.02] + - [804, 9755.02] - - [1024, 3990, 1, 4096] - - [695, 9251.02] + - [801, 9251.02] - - [4096, 3500, 1, 1024] - - [698, 9673.83] + - [804, 9673.83] - - [4096, 3996, 1, 1024] - - [699, 9694.5] + - [805, 9694.5] - - [1024, 3247, 1, 4096] - - [724, 8171.58] + - [830, 8171.58] - - [4096, 3395, 1, 1024] - - [700, 9392.04] + - [806, 9392.04] - - [1024, 3169, 1, 4096] - - [719, 7990.24] + - [825, 7990.24] - - [1024, 3088, 1, 4096] - - [719, 7890.36] + - [825, 7890.36] - - [1024, 3584, 1, 4096] - - [721, 8604.2] + - [827, 8604.2] - - [4096, 3093, 1, 1024] - - [700, 9224.88] + - [806, 9224.88] - - [1024, 3538, 1, 4096] - - [702, 8395.74] + - [808, 8395.74] - - [1024, 3996, 1, 1024] - - [703, 9208.33] + - [809, 9208.33] - - [1024, 3581, 1, 4096] - - [715, 8523.24] + - [821, 8523.24] - - [4096, 3374, 1, 1024] - - [700, 9342.81] + - [806, 9342.81] - - [33708, 3751, 1, 1024] - - [700, 9881.99] + - [806, 9881.99] - - [59, 59, 1088, 64] - - [738, 4515.54] + - [844, 4515.54] - - [4096, 3215, 1, 1024] - - [700, 9557.75] + - [806, 9557.75] - - [4096, 3312, 1, 1024] - - [698, 9834.4] + - [804, 9834.4] - - [4096, 3581, 1, 1024] - - [700, 9856.66] + - [806, 9856.66] - - [4096, 3479, 1, 1024] - - [700, 9620.35] + - [806, 9620.35] - - [4096, 3544, 1, 1024] - - [698, 9778.94] + - [804, 9778.94] - - [1024, 3870, 1, 1024] - - [703, 8935.26] + - [809, 8935.26] - - [1024, 3374, 1, 4096] - - [720, 8412.85] + - [826, 8412.85] - - [1024, 2967, 1, 4096] - - [703, 8982.97] + - [809, 8982.97] - - [41, 41, 1552, 64] - - [732, 2805.38] + - [838, 2805.38] - - [4096, 3455, 1, 1024] - - [698, 9538.89] + - [804, 9538.89] - - [4096, 3942, 1, 1024] - - [699, 9554.65] + - [805, 9554.65] - - [1024, 3528, 1, 4096] - - [718, 8438.47] + - [824, 8438.47] - - [4096, 3186, 1, 1024] - - [699, 9468.32] + - [805, 9468.32] - - [1024, 3976, 1, 1024] - - [703, 9167.08] + - [809, 9167.08] - - [1024, 3511, 1, 4096] - - [705, 8335.06] + - [811, 8335.06] - - [4096, 3573, 1, 1024] - - [698, 9855.33] + - [804, 9855.33] - - [4096, 3561, 1, 1024] - - [698, 9831.03] + - [804, 9831.03] - - [4096, 3418, 1, 1024] - - [699, 9450.68] + - [805, 9450.68] - - [33708, 3906, 1, 1024] - - [701, 9973.67] + - [807, 9973.67] - - [4096, 3259, 1, 1024] - - [698, 9685.26] + - [804, 9685.26] - - [4096, 3308, 1, 1024] - - [700, 9792.03] + - [806, 9792.03] - - [1024, 3419, 1, 4096] - - [720, 8514.53] + - [826, 8514.53] - - [1024, 3215, 1, 4096] - - [719, 8137.53] + - [825, 8137.53] - - [1024, 4030, 1, 4096] - - [701, 9290.76] + - [807, 9290.76] - - [4096, 3459, 1, 1024] - - [698, 9567.57] + - [804, 9567.57] - - [1024, 3572, 1, 4096] - - [718, 8501.43] + - [824, 8501.43] - - [1024, 3137, 1, 4096] - - [720, 7930.15] + - [826, 7930.15] - - [1024, 3312, 1, 4096] - - [721, 8378.6] + - [827, 8378.6] - - [1024, 3925, 1, 4096] - - [703, 9255.86] + - [809, 9255.86] - - [1024, 3453, 1, 4096] - - [720, 8630.76] + - [826, 8630.76] - - [4096, 3435, 1, 1024] - - [699, 9495.18] + - [805, 9495.18] - - [1024, 3176, 1, 4096] - - [720, 8087.23] + - [826, 8087.23] - - [1024, 3444, 1, 4096] - - [712, 8528.58] + - [818, 8528.58] - - [4096, 3975, 1, 1024] - - [701, 9645.34] + - [807, 9645.34] - - [4096, 3182, 1, 1024] - - [700, 9448.4] + - [806, 9448.4] - - [1024, 3475, 1, 4096] - - [719, 8404.87] + - [825, 8404.87] - - [9, 9, 6544, 64] - - [734, 425.854] + - [840, 425.854] - - [33708, 3955, 1, 1024] - - [701, 10088.4] + - [807, 10088.4] - - [4096, 3446, 1, 1024] - - [700, 9520.06] + - [806, 9520.06] - - [1024, 3138, 1, 4096] - - [719, 8053.44] + - [825, 8053.44] - - [1024, 3549, 1, 4096] - - [705, 8426.42] + - [811, 8426.42] - - [4096, 3287, 1, 1024] - - [701, 9751.34] + - [807, 9751.34] - - [1024, 3342, 1, 4096] - - [718, 8320.01] + - [824, 8320.01] - - [102, 102, 624, 64] - - [733, 4747.52] + - [839, 4747.52] - - [4096, 3519, 1, 1024] - - [700, 9716.1] + - [806, 9716.1] - - [4096, 3552, 1, 1024] - - [698, 9806.69] + - [804, 9806.69] - - [4096, 3859, 1, 1024] - - [698, 9369.94] + - [804, 9369.94] - - [33708, 3969, 1, 1024] - - [698, 9830.39] + - [804, 9830.39] - - [1024, 3369, 1, 4096] - - [719, 8379.26] + - [825, 8379.26] - - [4096, 3482, 1, 1024] - - [698, 9631.7] + - [804, 9631.7] - - [1024, 3306, 1, 4096] - - [721, 8320.06] + - [827, 8320.06] - - [1024, 3474, 1, 4096] - - [720, 8498.9] + - [826, 8498.9] - - [99, 99, 624, 64] - - [732, 4492.9] + - [838, 4492.9] - - [4096, 3377, 1, 1024] - - [698, 9369.92] + - [804, 9369.92] - - [4096, 3426, 1, 1024] - - [698, 9467.3] + - [804, 9467.3] - - [4096, 2935, 1, 1024] - - [699, 9423.74] + - [805, 9423.74] - - [4096, 3267, 1, 1024] - - [698, 9698.04] + - [804, 9698.04] - - [1024, 3299, 1, 4096] - - [719, 8264.76] + - [825, 8264.76] - - [1024, 3456, 1, 4096] - - [718, 8678.39] + - [824, 8678.39] - - [1024, 3280, 1, 4096] - - [719, 8220.69] + - [825, 8220.69] - - [1024, 3555, 1, 4096] - - [718, 8656.27] + - [824, 8656.27] - - [4096, 3499, 1, 1024] - - [700, 9663.93] + - [806, 9663.93] - - [4096, 3356, 1, 1024] - - [700, 9296.9] + - [806, 9296.9] - - [100, 102, 624, 64] - - [733, 4671.51] + - [839, 4671.51] - - [1024, 3412, 1, 4096] - - [721, 8538.05] + - [827, 8538.05] - - [1024, 2984, 1, 4096] - - [704, 9193.17] + - [810, 9193.17] - - [4096, 3141, 1, 1024] - - [700, 9349.43] + - [806, 9349.43] - - [4096, 3510, 1, 1024] - - [698, 9701.98] + - [804, 9701.98] - - [1024, 3995, 1, 1024] - - [702, 9243.4] + - [808, 9243.4] - - [1024, 3517, 1, 4096] - - [720, 8569.31] + - [826, 8569.31] - - [1024, 3455, 1, 4096] - - [720, 8560.67] + - [826, 8560.67] - - [1024, 3939, 1, 1024] - - [703, 9030.94] + - [809, 9030.94] - - [38, 38, 1680, 64] - - [732, 2459.84] + - [838, 2459.84] - - [1024, 3447, 1, 4096] - - [718, 8610.02] + - [824, 8610.02] - - [1024, 3969, 1, 4096] - - [705, 9097.33] + - [811, 9097.33] - - [4096, 3527, 1, 1024] - - [700, 9743.83] + - [806, 9743.83] - - [4096, 3336, 1, 1024] - - [700, 9248.33] + - [806, 9248.33] - - [1024, 3191, 1, 4096] - - [718, 8104.96] + - [824, 8104.96] - - [1024, 3302, 1, 4096] - - [719, 8245.09] + - [825, 8245.09] - - [1024, 3337, 1, 4096] - - [721, 8254.25] + - [827, 8254.25] - - [4096, 3290, 1, 1024] - - [700, 9759.13] + - [806, 9759.13] - - [1024, 3512, 1, 4096] - - [709, 8641.06] + - [815, 8641.06] - - [1024, 3433, 1, 4096] - - [719, 8444.7] + - [825, 8444.7] - - [4096, 3876, 1, 1024] - - [699, 9420.38] + - [805, 9420.38] - - [4096, 3490, 1, 1024] - - [700, 9641.11] + - [806, 9641.11] - - [4096, 3064, 1, 1024] - - [700, 9820.49] + - [806, 9820.49] - - [1024, 3508, 1, 4096] - - [715, 8442.24] + - [821, 8442.24] - - [1024, 3956, 1, 4096] - - [700, 9128.19] + - [806, 9128.19] - - [4096, 3417, 1, 1024] - - [700, 9448.41] + - [806, 9448.41] - - [1024, 3248, 1, 4096] - - [719, 8006.16] + - [825, 8006.16] - - [1024, 2499, 1, 4096] - - [719, 8155.19] + - [825, 8155.19] - - [1024, 3186, 1, 4096] - - [719, 8093.04] + - [825, 8093.04] - - [1024, 3180, 1, 4096] - - [721, 8097.02] + - [827, 8097.02] - - [4096, 3364, 1, 1024] - - [700, 9318.08] + - [806, 9318.08] - - [4096, 3976, 1, 1024] - - [700, 9654.47] + - [806, 9654.47] - - [4096, 3205, 1, 1024] - - [701, 9538.84] + - [807, 9538.84] - - [4096, 3318, 1, 1024] - - [698, 9838.29] + - [804, 9838.29] - - [1024, 3377, 1, 4096] - - [721, 8445.64] + - [827, 8445.64] - - [1024, 3485, 1, 4096] - - [718, 8368.83] + - [824, 8368.83] - - [4096, 3181, 1, 1024] - - [701, 9458.29] + - [807, 9458.29] - - [4096, 3550, 1, 1024] - - [698, 9783.14] + - [804, 9783.14] - - [1024, 3534, 1, 4096] - - [707, 8684.99] + - [813, 8684.99] - - [1024, 3860, 1, 1024] - - [702, 8923.18] + - [808, 8923.18] - - [160, 160, 400, 64] - - [745, 5797.69] + - [851, 5797.69] - - [4096, 3445, 1, 1024] - - [700, 9511.28] + - [806, 9511.28] - - [1024, 3391, 1, 4096] - - [721, 8541.77] + - [827, 8541.77] - - [1024, 3221, 1, 4096] - - [719, 8055.5] + - [825, 8055.5] - - [4096, 3079, 1, 1024] - - [698, 9181.04] + - [804, 9181.04] - - [4096, 3144, 1, 1024] - - [700, 9351.45] + - [806, 9351.45] - - [1024, 3270, 1, 4096] - - [720, 8367.63] + - [826, 8367.63] - - [1024, 3561, 1, 4096] - - [720, 8426.29] + - [826, 8426.29] - - [1024, 3480, 1, 4096] - - [707, 8465.0] + - [813, 8465.0] - - [4096, 3408, 1, 1024] - - [700, 9420.04] + - [806, 9420.04] - - [1024, 3418, 1, 4096] - - [721, 8481.02] + - [827, 8481.02] - - [4096, 3298, 1, 1024] - - [701, 9788.4] + - [807, 9788.4] - - [1024, 3640, 1, 1024] - - [704, 8435.44] + - [810, 8435.44] - - [1024, 3449, 1, 4096] - - [719, 8590.87] + - [825, 8590.87] - - [1024, 4020, 1, 4096] - - [697, 9168.13] + - [803, 9168.13] - - [4096, 3481, 1, 1024] - - [698, 9627.91] + - [804, 9627.91] - - [4096, 3530, 1, 1024] - - [700, 9734.68] + - [806, 9734.68] - - [1024, 3216, 1, 4096] - - [721, 8014.32] + - [827, 8014.32] - - [1024, 3840, 1, 1024] - - [704, 8908.37] + - [810, 8908.37] - - [1024, 3491, 1, 4096] - - [707, 8410.59] + - [813, 8410.59] - - [1024, 3154, 1, 4096] - - [720, 8095.69] + - [826, 8095.69] - - [4096, 3425, 1, 1024] - - [700, 9474.53] + - [806, 9474.53] - - [1024, 3348, 1, 4096] - - [718, 8202.9] + - [824, 8202.9] - - [1024, 3415, 1, 4096] - - [719, 8597.68] + - [825, 8597.68] - - [1024, 4026, 1, 1024] - - [702, 9279.09] + - [808, 9279.09] - - [1024, 3367, 1, 4096] - - [721, 8335.54] + - [827, 8335.54] - - [1024, 3259, 1, 4096] - - [721, 8285.3] + - [827, 8285.3] - - [1024, 3894, 1, 4096] - - [704, 9040.44] + - [810, 9040.44] - - [4096, 3355, 1, 1024] - - [699, 9291.67] + - [805, 9291.67] - - [4096, 3404, 1, 1024] - - [700, 9410.47] + - [806, 9410.47] - - [1024, 3308, 1, 4096] - - [721, 8336.3] + - [827, 8336.3] - - [4096, 3245, 1, 1024] - - [699, 9641.47] + - [805, 9641.47] - - [1024, 3502, 1, 4096] - - [720, 8375.9] + - [826, 8375.9] - - [33708, 4032, 1, 1024] - - [699, 9988.2] + - [805, 9988.2] - - [8, 8, 7280, 64] - - [736, 339.878] + - [842, 339.878] - - [1024, 3424, 1, 4096] - - [707, 8489.48] + - [813, 8489.48] - - [4096, 3509, 1, 1024] - - [699, 9702.29] + - [805, 9702.29] - - [4096, 3558, 1, 1024] - - [700, 9815.51] + - [806, 9815.51] - - [1024, 3900, 1, 1024] - - [703, 9014.05] + - [809, 9014.05] - - [1024, 2505, 1, 4096] - - [717, 8263.75] + - [823, 8263.75] - - [4096, 3472, 1, 1024] - - [698, 9609.61] + - [804, 9609.61] - - [1024, 3386, 1, 4096] - - [718, 8417.55] + - [824, 8417.55] - - [4096, 3383, 1, 1024] - - [700, 9364.77] + - [806, 9364.77] - - [4096, 3448, 1, 1024] - - [701, 9521.07] + - [807, 9521.07] - - [4096, 4030, 1, 1024] - - [701, 9771.56] + - [807, 9771.56] - - [4096, 3289, 1, 1024] - - [698, 9757.27] + - [804, 9757.27] - - [1024, 3459, 1, 4096] - - [720, 8422.12] + - [826, 8422.12] - - [1024, 2918, 1, 4096] - - [705, 9022.71] + - [811, 9022.71] - - [4096, 3489, 1, 1024] - - [698, 9641.9] + - [804, 9641.9] - - [4096, 3346, 1, 1024] - - [700, 9271.65] + - [806, 9271.65] - - [4096, 3572, 1, 1024] - - [700, 9829.82] + - [806, 9829.82] - - [1024, 3955, 1, 4096] - - [701, 9221.66] + - [807, 9221.66] - - [4096, 3236, 1, 1024] - - [698, 9620.72] + - [804, 9620.72] - - [4096, 3163, 1, 1024] - - [698, 9397.3] + - [804, 9397.3] - - [4096, 3468, 1, 1024] - - [698, 9601.58] + - [804, 9601.58] - - [1024, 3165, 1, 4096] - - [720, 7941.58] + - [826, 7941.58] - - [1024, 3276, 1, 4096] - - [720, 8244.96] + - [826, 8244.96] - - [1024, 3359, 1, 4096] - - [718, 8273.93] + - [824, 8273.93] - - [4096, 3363, 1, 1024] - - [700, 9315.8] + - [806, 9315.8] - - [1024, 3385, 1, 4096] - - [712, 8286.2] + - [818, 8286.2] - - [1024, 3207, 1, 4096] - - [721, 8144.02] + - [827, 8144.02] - - [1024, 3458, 1, 4096] - - [720, 8472.41] + - [826, 8472.41] - - [21, 21, 2976, 64] - - [736, 2083.3] + - [842, 2083.3] - - [4096, 3110, 1, 1024] - - [698, 9260.3] + - [804, 9260.3] - - [4096, 3925, 1, 1024] - - [701, 9526.66] + - [807, 9526.66] - - [1024, 3975, 1, 4096] - - [696, 9133.84] + - [802, 9133.84] - - [4096, 3549, 1, 1024] - - [700, 9793.77] + - [806, 9793.77] - - [4096, 3342, 1, 1024] - - [699, 9264.48] + - [805, 9264.48] - - [1024, 3859, 1, 1024] - - [702, 8933.47] + - [808, 8933.47] - - [1024, 3497, 1, 4096] - - [719, 8526.13] + - [825, 8526.13] - - [4096, 3280, 1, 1024] - - [700, 9733.32] + - [806, 9733.32] - - [1024, 3435, 1, 4096] - - [719, 8489.85] + - [825, 8489.85] - - [1024, 3354, 1, 4096] - - [719, 8248.83] + - [825, 8248.83] - - [4096, 3191, 1, 1024] - - [699, 9475.12] + - [805, 9475.12] - - [4096, 3512, 1, 1024] - - [698, 9701.37] + - [804, 9701.37] - - [1024, 3055, 1, 4096] - - [705, 9264.91] + - [811, 9264.91] - - [4096, 2499, 1, 1024] - - [700, 9574.06] + - [806, 9574.06] - - [1024, 3233, 1, 4096] - - [718, 8101.74] + - [824, 8101.74] - - [4096, 3423, 1, 1024] - - [701, 9463.5] + - [807, 9463.5] - - [1024, 3319, 1, 4096] - - [721, 8413.76] + - [827, 8413.76] - - [4096, 3297, 1, 1024] - - [698, 9782.66] + - [804, 9782.66] - - [4096, 3154, 1, 1024] - - [700, 9381.2] + - [806, 9381.2] - - [1024, 3540, 1, 4096] - - [721, 8507.53] + - [827, 8507.53] - - [1024, 3289, 1, 4096] - - [721, 8233.8] + - [827, 8233.8] - - [4096, 3529, 1, 1024] - - [700, 9741.15] + - [806, 9741.15] - - [4096, 3386, 1, 1024] - - [700, 9372.57] + - [806, 9372.57] - - [4096, 3276, 1, 1024] - - [698, 9713.76] + - [804, 9713.76] - - [1024, 3244, 1, 4096] - - [721, 8146.83] + - [827, 8146.83] - - [1024, 3182, 1, 4096] - - [718, 8115.12] + - [824, 8115.12] - - [4096, 3540, 1, 1024] - - [698, 9768.42] + - [804, 9768.42] - - [1024, 3360, 1, 4096] - - [720, 8353.31] + - [826, 8353.31] - - [1024, 3942, 1, 4096] - - [699, 9143.78] + - [805, 9143.78] - - [4096, 3403, 1, 1024] - - [701, 9412.18] + - [807, 9412.18] - - [4096, 3101, 1, 1024] - - [701, 9239.28] + - [807, 9239.28] - - [4096, 2918, 1, 1024] - - [700, 9373.75] + - [806, 9373.75] - - [1024, 3465, 1, 4096] - - [721, 8288.16] + - [827, 8288.16] - - [33708, 3780, 1, 1024] - - [700, 9971.91] + - [806, 9971.91] - - [4096, 3557, 1, 1024] - - [698, 9814.82] + - [804, 9814.82] - - [4096, 3414, 1, 1024] - - [698, 9436.63] + - [804, 9436.63] - - [1024, 3948, 1, 1024] - - [702, 9073.8] + - [808, 9073.8] - - [4096, 3320, 1, 1024] - - [700, 9834.77] + - [806, 9834.77] - - [4096, 2765, 1, 1024] - - [700, 9667.06] + - [806, 9667.06] - - [1024, 3978, 1, 4096] - - [695, 9109.6] + - [801, 9109.6] - - [4096, 3487, 1, 1024] - - [698, 9644.0] + - [804, 9644.0] - - [4096, 3520, 1, 1024] - - [700, 9728.08] + - [806, 9728.08] - - [1024, 3139, 1, 4096] - - [720, 7940.19] + - [826, 7940.19] - - [1024, 3314, 1, 4096] - - [718, 8294.01] + - [824, 8294.01] - - [4096, 3431, 1, 1024] - - [700, 9482.12] + - [806, 9482.12] - - [123, 122, 528, 64] - - [733, 6325.98] + - [839, 6325.98] - - [1024, 3446, 1, 4096] - - [714, 8468.34] + - [820, 8468.34] - - [1024, 4059, 1, 4096] - - [701, 9370.8] + - [807, 9370.8] - - [99, 102, 624, 64] - - [733, 4624.8] + - [839, 4624.8] - - [4096, 3345, 1, 1024] - - [698, 9271.32] + - [804, 9271.32] - - [4096, 3394, 1, 1024] - - [698, 9398.19] + - [804, 9398.19] - - [1024, 3927, 1, 1024] - - [703, 9041.38] + - [809, 9041.38] - - [4096, 3235, 1, 1024] - - [698, 9619.93] + - [804, 9619.93] - - [1024, 3328, 1, 4096] - - [719, 8406.09] + - [825, 8406.09] - - [33708, 3956, 1, 1024] - - [699, 10100.4] + - [805, 10100.4] - - [4096, 3467, 1, 1024] - - [700, 9586.66] + - [806, 9586.66] - - [1024, 3287, 1, 4096] - - [720, 8273.83] + - [826, 8273.83] - - [4096, 3214, 1, 1024] - - [701, 9557.49] + - [807, 9557.49] - - [4096, 3910, 1, 1024] - - [698, 9490.25] + - [804, 9490.25] - - [1024, 3780, 1, 1024] - - [705, 8706.0] + - [811, 8706.0] - - [1024, 3371, 1, 4096] - - [721, 8248.46] + - [827, 8248.46] - - [4096, 3478, 1, 1024] - - [701, 9619.62] + - [807, 9619.62] - - [1024, 3546, 1, 4096] - - [719, 8456.83] + - [825, 8456.83] - - [1024, 4012, 1, 1024] - - [702, 9253.34] + - [808, 9253.34] - - [4096, 3341, 1, 1024] - - [700, 9260.24] + - [806, 9260.24] - - [4096, 3454, 1, 1024] - - [698, 9533.62] + - [804, 9533.62] - - [4096, 3295, 1, 1024] - - [701, 9772.86] + - [807, 9772.86] - - [4096, 3072, 1, 1024] - - [698, 9887.23] + - [804, 9887.23] - - [1024, 3282, 1, 4096] - - [706, 8112.85] + - [812, 8112.85] - - [33708, 3720, 1, 1024] - - [701, 9818.85] + - [807, 9818.85] - - [1024, 3681, 1, 4096] - - [703, 8639.28] + - [809, 8639.28] - - [1024, 4050, 1, 4096] - - [701, 9291.93] + - [807, 9291.93] - - [4096, 3495, 1, 1024] - - [700, 9660.52] + - [806, 9660.52] - - [4096, 3560, 1, 1024] - - [699, 9813.8] + - [805, 9813.8] - - [4096, 3751, 1, 1024] - - [698, 9684.95] + - [804, 9684.95] - - [1024, 3414, 1, 4096] - - [719, 8555.72] + - [825, 8555.72] - - [33708, 3860, 1, 1024] - - [698, 9856.68] + - [804, 9856.68] - - [1024, 3325, 1, 4096] - - [708, 8261.21] + - [814, 8261.21] - - [4096, 3458, 1, 1024] - - [698, 9570.86] + - [804, 9570.86] - - [4096, 2967, 1, 1024] - - [698, 9544.61] + - [804, 9544.61] - - [1024, 3519, 1, 4096] - - [721, 8413.1] + - [827, 8413.1] - - [4096, 3385, 1, 1024] - - [700, 9367.34] + - [806, 9367.34] - - [4096, 3434, 1, 1024] - - [698, 9488.41] + - [804, 9488.41] - - [1024, 3552, 1, 4096] - - [719, 8456.13] + - [825, 8456.13] - - [4096, 3822, 1, 1024] - - [699, 9849.84] + - [805, 9849.84] - - [1024, 3544, 1, 4096] - - [718, 8494.56] + - [824, 8494.56] - - [4096, 3539, 1, 1024] - - [700, 9763.09] + - [806, 9763.09] - - [4096, 3332, 1, 1024] - - [698, 9232.36] + - [804, 9232.36] - - [1024, 3145, 1, 4096] - - [718, 8098.36] + - [824, 8098.36] - - [1024, 3535, 1, 4096] - - [706, 8592.8] + - [812, 8592.8] - - [1024, 3320, 1, 4096] - - [719, 8419.55] + - [825, 8419.55] - - [33708, 4012, 1, 1024] - - [701, 9940.2] + - [807, 9940.2] - - [4096, 3286, 1, 1024] - - [700, 9747.82] + - [806, 9747.82] - - [1024, 3514, 1, 4096] - - [719, 8653.69] + - [825, 8653.69] - - [93, 93, 688, 64] - - [740, 5005.79] + - [846, 5005.79] - - [1024, 2765, 1, 4096] - - [705, 8636.72] + - [811, 8636.72] - - [1024, 3452, 1, 4096] - - [718, 8445.87] + - [824, 8445.87] - - [4096, 3518, 1, 1024] - - [698, 9722.56] + - [804, 9722.56] - - [1024, 3529, 1, 4096] - - [718, 8444.32] + - [824, 8444.32] - - [4096, 3413, 1, 1024] - - [698, 9436.35] + - [804, 9436.35] - - [33708, 4050, 1, 1024] - - [700, 10026.7] + - [806, 10026.7] - - [1024, 3525, 1, 4096] - - [711, 8488.99] + - [817, 8488.99] - - [4096, 3303, 1, 1024] - - [698, 9791.05] + - [804, 9791.05] - - [1024, 3382, 1, 4096] - - [719, 8483.63] + - [825, 8483.63] - - [1024, 3390, 1, 4096] - - [718, 8552.81] + - [824, 8552.81] - - [1024, 3977, 1, 4096] - - [700, 9053.53] + - [806, 9053.53] - - [1024, 3184, 1, 4096] - - [718, 8008.81] + - [824, 8008.81] - - [4096, 3535, 1, 1024] - - [700, 9760.79] + - [806, 9760.79] - - [4096, 3376, 1, 1024] - - [701, 9341.93] + - [807, 9341.93] - - [4096, 3978, 1, 1024] - - [701, 9642.8] + - [807, 9642.8] - - [1024, 3136, 1, 4096] - - [720, 8085.12] + - [826, 8085.12] - - [1024, 3293, 1, 4096] - - [718, 8300.49] + - [824, 8300.49] - - [4096, 3266, 1, 1024] - - [699, 9691.78] + - [805, 9691.78] - - [1024, 3487, 1, 4096] - - [718, 8383.62] + - [824, 8383.62] - - [1024, 3409, 1, 4096] - - [720, 8493.25] + - [826, 8493.25] - - [4096, 3498, 1, 1024] - - [699, 9672.38] + - [805, 9672.38] - - [1024, 3520, 1, 4096] - - [721, 8488.26] + - [827, 8488.26] - - [1024, 3530, 1, 4096] - - [702, 8409.87] + - [808, 8409.87] - - [4096, 3393, 1, 1024] - - [700, 9395.43] + - [806, 9395.43] - - [4096, 3140, 1, 1024] - - [700, 9338.5] + - [806, 9338.5] - - [1024, 3536, 1, 4096] - - [721, 8642.11] + - [827, 8642.11] - - [1024, 3288, 1, 4096] - - [721, 8229.34] + - [827, 8229.34] - - [1024, 4005, 1, 4096] - - [703, 9271.04] + - [809, 9271.04] - - [1024, 3579, 1, 4096] - - [707, 8844.5] + - [813, 8844.5] - - [4096, 3372, 1, 1024] - - [698, 9339.25] + - [804, 9339.25] - - [1024, 3440, 1, 4096] - - [718, 8466.69] + - [824, 8466.69] - - [4096, 3213, 1, 1024] - - [701, 9558.85] + - [807, 9558.85] - - [123, 123, 528, 64] - - [733, 6333.59] + - [839, 6333.59] - - [100, 100, 624, 64] - - [732, 4584.12] + - [838, 4584.12] - - [1024, 3968, 1, 4096] - - [699, 9237.6] + - [805, 9237.6] - - [4096, 3477, 1, 1024] - - [699, 9618.88] + - [805, 9618.88] - - [4096, 3526, 1, 1024] - - [698, 9735.94] + - [804, 9735.94] - - [1024, 3493, 1, 4096] - - [719, 8355.13] + - [825, 8355.13] - - [1024, 3944, 1, 4096] - - [694, 9065.39] + - [800, 9065.39] - - [4096, 3453, 1, 1024] - - [699, 9533.37] + - [805, 9533.37] - - [1024, 3350, 1, 4096] - - [721, 8448.64] + - [827, 8448.64] - - [4096, 3184, 1, 1024] - - [700, 9447.38] + - [806, 9447.38] - - [1024, 3423, 1, 4096] - - [719, 8465.38] + - [825, 8465.38] - - [4096, 3351, 1, 1024] - - [698, 9282.06] + - [804, 9282.06] - - [4096, 3416, 1, 1024] - - [698, 9446.64] + - [804, 9446.64] - - [1024, 3796, 1, 4096] - - [700, 8820.34] + - [806, 8820.34] - - [4096, 3257, 1, 1024] - - [698, 9671.64] + - [804, 9671.64] - - [4096, 3306, 1, 1024] - - [700, 9795.51] + - [806, 9795.51] - - [33708, 4020, 1, 1024] - - [700, 9961.85] + - [806, 9961.85] - - [19, 19, 3264, 64] - - [730, 1736.09] + - [836, 1736.09] - - [1024, 3426, 1, 4096] - - [718, 8518.61] + - [824, 8518.61] - - [4096, 3457, 1, 1024] - - [698, 9564.56] + - [804, 9564.56] - - [1024, 2935, 1, 4096] - - [703, 9067.79] + - [809, 9067.79] - - [1024, 3046, 1, 4096] - - [703, 9242.97] + - [809, 9242.97] - - [4096, 3433, 1, 1024] - - [700, 9495.65] + - [806, 9495.65] - - [1024, 3256, 1, 4096] - - [721, 8224.23] + - [827, 8224.23] - - [1024, 3531, 1, 4096] - - [718, 8524.19] + - [824, 8524.19] - - [4096, 3180, 1, 1024] - - [698, 9443.53] + - [804, 9443.53] - - [1024, 3388, 1, 4096] - - [720, 8352.82] + - [826, 8352.82] - - [4096, 3444, 1, 1024] - - [701, 9511.03] + - [807, 9511.03] - - [1024, 3501, 1, 4096] - - [708, 8461.12] + - [814, 8461.12] - - [1024, 3266, 1, 4096] - - [706, 8147.44] + - [812, 8147.44] - - [1024, 3267, 1, 4096] - - [721, 8391.49] + - [827, 8391.49] - - [1024, 3461, 1, 4096] - - [705, 8270.29] + - [811, 8270.29] - - [4096, 3870, 1, 1024] - - [700, 9399.69] + - [806, 9399.69] - - [4096, 3517, 1, 1024] - - [698, 9725.43] + - [804, 9725.43] - - [1024, 3566, 1, 4096] - - [721, 8669.76] + - [827, 8669.76] - - [4096, 3574, 1, 1024] - - [698, 9844.63] + - [804, 9844.63] - - [1024, 3876, 1, 1024] - - [703, 8961.74] + - [809, 8961.74] - - [25, 25, 2512, 64] - - [729, 2472.54] + - [835, 2472.54] - - [4096, 3720, 1, 1024] - - [698, 9612.49] + - [804, 9612.49] - - [4096, 3248, 1, 1024] - - [700, 9644.92] + - [806, 9644.92] - - [4096, 4059, 1, 1024] - - [698, 9826.42] + - [804, 9826.42] - - [1024, 3380, 1, 4096] - - [719, 8677.91] + - [825, 8677.91] - - [4096, 3480, 1, 1024] - - [700, 9626.16] + - [806, 9626.16] - - [1024, 3335, 1, 4096] - - [720, 8302.18] + - [826, 8302.18] - - [1024, 3345, 1, 4096] - - [720, 8323.13] + - [826, 8323.13] - - [4096, 3391, 1, 1024] - - [698, 9379.48] + - [804, 9379.48] - - [4096, 3424, 1, 1024] - - [700, 9466.77] + - [806, 9466.77] - - [1024, 3394, 1, 4096] - - [706, 8373.91] + - [812, 8373.91] - - [4096, 3265, 1, 1024] - - [700, 9700.89] + - [806, 9700.89] - - [1024, 3014, 1, 4096] - - [703, 9303.09] + - [809, 9303.09] - - [4096, 3497, 1, 1024] - - [698, 9668.6] + - [804, 9668.6] - - [4096, 3354, 1, 1024] - - [700, 9294.31] + - [806, 9294.31] - - [4096, 3055, 1, 1024] - - [699, 9780.88] + - [805, 9780.88] - - [1024, 3499, 1, 4096] - - [712, 8527.04] + - [818, 8527.04] - - [1024, 3162, 1, 4096] - - [720, 8059.02] + - [826, 8059.02] - - [4096, 3244, 1, 1024] - - [700, 9636.86] + - [806, 9636.86] - - [1024, 3437, 1, 4096] - - [719, 8583.41] + - [825, 8583.41] - - [1024, 3356, 1, 4096] - - [721, 8296.95] + - [827, 8296.95] - - [4096, 3139, 1, 1024] - - [700, 9338.7] + - [806, 9338.7] - - [4096, 3508, 1, 1024] - - [700, 9700.54] + - [806, 9700.54] - - [1024, 3235, 1, 4096] - - [718, 8314.59] + - [824, 8314.59] - - [1024, 3910, 1, 4096] - - [705, 9200.21] + - [811, 9200.21] - - [4096, 3371, 1, 1024] - - [698, 9336.97] + - [804, 9336.97] - - [1024, 3751, 1, 4096] - - [705, 8827.67] + - [811, 8827.67] - - [4096, 3325, 1, 1024] - - [698, 9845.68] + - [804, 9845.68] - - [1024, 3413, 1, 4096] - - [706, 8345.78] + - [812, 8345.78] - - [1024, 3542, 1, 4096] - - [718, 8521.71] + - [824, 8521.71] - - [18, 18, 3440, 64] - - [734, 1578.24] + - [840, 1578.24] - - [101, 102, 624, 64] - - [732, 4705.28] + - [838, 4705.28] - - [33708, 3900, 1, 1024] - - [698, 9951.05] + - [804, 9951.05] - - [4096, 3525, 1, 1024] - - [699, 9744.47] + - [805, 9744.47] - - [4096, 3382, 1, 1024] - - [699, 9359.03] + - [805, 9359.03] - - [102, 100, 624, 64] - - [733, 4671.51] + - [839, 4671.51] - - [15, 15, 4096, 64] - - [737, 1129.17] + - [843, 1129.17] - - [1024, 3339, 1, 4096] - - [707, 8326.37] + - [813, 8326.37] - - [4096, 3288, 1, 1024] - - [700, 9761.48] + - [806, 9761.48] - - [92, 92, 688, 64] - - [740, 4903.87] + - [846, 4903.87] - - [1024, 3141, 1, 4096] - - [718, 7975.64] + - [824, 7975.64] - - [1024, 3168, 1, 4096] - - [718, 8083.74] + - [824, 8083.74] - - [4096, 3488, 1, 1024] - - [700, 9646.77] + - [806, 9646.77] - - [4096, 3046, 1, 1024] - - [699, 9767.58] + - [805, 9767.58] - - [1024, 3362, 1, 4096] - - [721, 8458.15] + - [827, 8458.15] - - [33708, 3942, 1, 1024] - - [699, 10060.4] + - [805, 10060.4] - - [4096, 3399, 1, 1024] - - [700, 9406.57] + - [806, 9406.57] - - [1024, 3720, 1, 1024] - - [702, 8639.16] + - [808, 8639.16] - - [4096, 3563, 1, 1024] - - [698, 9836.55] + - [804, 9836.55] - - [1024, 3273, 1, 4096] - - [721, 8221.62] + - [827, 8221.62] - - [4096, 3162, 1, 1024] - - [700, 9400.19] + - [806, 9400.19] - - [1024, 3467, 1, 4096] - - [719, 8342.42] + - [825, 8342.42] - - [1024, 3130, 1, 4096] - - [720, 7933.88] + - [826, 7933.88] - - [1024, 3405, 1, 4096] - - [727, 8406.59] + - [833, 8406.59] - - [4096, 3362, 1, 1024] - - [698, 9312.04] + - [804, 9312.04] - - [1024, 3960, 1, 1024] - - [702, 9082.26] + - [808, 9082.26] - - [2048, 128, 1, 4096] - - [752, 5986.62] + - [858, 5986.62] - - [1024, 3712, 1, 36548] - - [750, 9456.25] + - [856, 9456.25] - - [1024, 128, 1, 1024] - - [753, 3631.53] + - [859, 3631.53] - - [3072, 128, 1, 4096] - - [749, 6145.6] + - [855, 6145.6] - - [1024, 3712, 1, 1024] - - [751, 8933.98] + - [857, 8933.98] - - [256, 256, 192, 64] - - [756, 8264.74] + - [862, 8264.74] - - [768, 4096, 1, 768] - - [769, 9642.18] + - [875, 9642.18] - - [768, 64, 1, 768] - - [766, 1850.53] + - [872, 1850.53] - - [768, 1280, 1, 768] - - [769, 8738.23] + - [875, 8738.23] - - [30522, 320, 1, 768] - - [770, 9733.69] + - [876, 9733.69] - - [128, 128, 96, 64] - - [759, 5470.93] + - [865, 5470.93] - - [2, 16, 1, 768] - - [762, 2.57742] + - [868, 2.57742] - - [30522, 1280, 1, 768] - - [768, 10128.0] + - [874, 10128.0] - - [30522, 640, 1, 768] - - [769, 9987.71] + - [875, 9987.71] - - [2, 8, 1, 768] - - [761, 1.06] + - [867, 1.06] - - [768, 4096, 1, 3072] - - [771, 9479.51] + - [877, 9479.51] - - [768, 32, 1, 768] - - [765, 880.434] + - [871, 880.434] - - [2, 64, 1, 768] - - [762, 10.09024] + - [868, 10.09024] - - [256, 256, 96, 64] - - [756, 7614.57] + - [862, 7614.57] - - [64, 64, 768, 64] - - [758, 5354.53] + - [864, 5354.53] - - [30522, 160, 1, 768] - - [767, 7740.21] + - [873, 7740.21] - - [768, 320, 1, 768] - - [760, 5423.77] + - [866, 5423.77] - - [128, 128, 384, 64] - - [757, 7180.08] + - [863, 7180.08] - - [768, 16, 1, 768] - - [763, 706.476] + - [869, 706.476] - - [3072, 4096, 1, 768] - - [772, 9961.84] + - [878, 9961.84] - - [2048, 512, 1, 100] - - [774, 5180.81] + - [880, 5180.81] - - [1024, 200, 1, 560] - - [775, 4061.29] + - [881, 4061.29] - - [256, 1280, 1, 1024] - - [782, 4337.54] + - [888, 4337.54] - - [256, 44505, 1, 1024] - - [818, 8597.79] + - [924, 8597.79] - - [10240, 8976, 1, 256] - - [821, 9471.53] + - [927, 9471.53] - - [256, 7168, 1, 1024] - - [812, 6718.66] + - [918, 6718.66] - - [8448, 8976, 1, 256] - - [804, 9601.41] + - [910, 9601.41] - - [18944, 8976, 1, 256] - - [813, 9666.36] + - [919, 9666.36] - - [256, 19200, 1, 1024] - - [789, 7489.04] + - [895, 7489.04] - - [5632, 8976, 1, 256] - - [801, 9358.49] + - [907, 9358.49] - - [256, 23552, 1, 1024] - - [816, 7980.99] + - [922, 7980.99] - - [256, 6656, 1, 1024] - - [816, 6287.32] + - [922, 6287.32] - - [256, 14336, 1, 1024] - - [811, 7049.36] + - [917, 7049.36] - - [256, 12544, 1, 1024] - - [789, 6728.57] + - [895, 6728.57] - - [2048, 684, 1, 768] - - [806, 8479.28] + - [912, 8479.28] - - [5376, 8976, 1, 256] - - [801, 9519.61] + - [907, 9519.61] - - [256, 5888, 1, 1024] - - [821, 6012.5] + - [927, 6012.5] - - [19968, 8976, 1, 256] - - [813, 9684.77] + - [919, 9684.77] - - [3840, 8976, 1, 256] - - [798, 9461.99] + - [904, 9461.99] - - [4608, 8976, 1, 256] - - [798, 9305.92] + - [904, 9305.92] - - [256, 684, 1, 1024] - - [824, 3513.16] + - [930, 3513.16] - - [256, 22016, 1, 1024] - - [789, 7643.89] + - [895, 7643.89] - - [256, 23296, 1, 1024] - - [818, 8048.22] + - [924, 8048.22] - - [4864, 8976, 1, 256] - - [796, 9545.72] + - [902, 9545.72] - - [256, 7424, 1, 1024] - - [814, 6770.75] + - [920, 6770.75] - - [18176, 8976, 1, 256] - - [821, 9729.57] + - [927, 9729.57] - - [256, 15104, 1, 1024] - - [810, 7289.18] + - [916, 7289.18] - - [8192, 8976, 1, 256] - - [813, 9395.59] + - [919, 9395.59] - - [256, 16128, 1, 1024] - - [813, 7461.38] + - [919, 7461.38] - - [13312, 8976, 1, 256] - - [821, 9551.07] + - [927, 9551.07] - - [256, 21504, 1, 1024] - - [818, 7636.03] + - [924, 7636.03] - - [6400, 8976, 1, 256] - - [805, 9561.06] + - [911, 9561.06] - - [256, 8960, 1, 1024] - - [780, 6292.46] + - [886, 6292.46] - - [1792, 8976, 1, 256] - - [795, 9372.28] + - [901, 9372.28] - - [13824, 8976, 1, 256] - - [813, 9585.37] + - [919, 9585.37] - - [11776, 8976, 1, 256] - - [813, 9560.44] + - [919, 9560.44] - - [256, 20992, 1, 1024] - - [811, 7490.75] + - [917, 7490.75] - - [20480, 8976, 1, 256] - - [821, 9610.8] + - [927, 9610.8] - - [5888, 8976, 1, 256] - - [792, 9565.3] + - [898, 9565.3] - - [256, 10496, 1, 1024] - - [783, 6632.06] + - [889, 6632.06] - - [21248, 8976, 1, 256] - - [813, 9755.87] + - [919, 9755.87] - - [5120, 8976, 1, 256] - - [821, 9244.69] + - [927, 9244.69] - - [7168, 8976, 1, 256] - - [813, 9388.52] + - [919, 9388.52] - - [2048, 1536, 1, 768] - - [802, 9446.14] + - [908, 9446.14] - - [256, 8192, 1, 1024] - - [807, 6948.99] + - [913, 6948.99] - - [4096, 8976, 1, 256] - - [812, 9116.04] + - [918, 9116.04] - - [3328, 8976, 1, 256] - - [805, 9434.65] + - [911, 9434.65] - - [1280, 8976, 1, 256] - - [803, 9129.9] + - [909, 9129.9] - - [2560, 8976, 1, 256] - - [800, 9199.58] + - [906, 9199.58] - - [3072, 8976, 1, 256] - - [815, 8963.7] + - [921, 8963.7] - - [256, 11776, 1, 1024] - - [793, 6869.9] + - [899, 6869.9] - - [18688, 8976, 1, 256] - - [821, 9726.31] + - [927, 9726.31] - - [15104, 8976, 1, 256] - - [821, 9715.81] + - [927, 9715.81] - - [23552, 8976, 1, 256] - - [813, 9648.52] + - [919, 9648.52] - - [6144, 8976, 1, 256] - - [821, 9339.9] + - [927, 9339.9] - - [12544, 8976, 1, 256] - - [821, 9654.55] + - [927, 9654.55] - - [256, 11264, 1, 1024] - - [794, 6815.08] + - [900, 6815.08] - - [2048, 114, 1, 512] - - [825, 4583.6] + - [931, 4583.6] - - [4352, 8976, 1, 256] - - [805, 9471.5] + - [911, 9471.5] - - [15360, 8976, 1, 256] - - [821, 9583.87] + - [927, 9583.87] - - [256, 31488, 1, 1024] - - [820, 8438.11] + - [926, 8438.11] - - [28672, 8976, 1, 256] - - [813, 9688.95] + - [919, 9688.95] - - [256, 18176, 1, 1024] - - [789, 7405.19] + - [895, 7405.19] - - [9728, 8976, 1, 256] - - [821, 9524.25] + - [927, 9524.25] - - [256, 2816, 1, 1024] - - [785, 5405.76] + - [891, 5405.76] - - [256, 18944, 1, 1024] - - [789, 7503.51] + - [895, 7503.51] - - [256, 3584, 1, 1024] - - [788, 6107.25] + - [894, 6107.25] - - [7936, 8976, 1, 256] - - [801, 9608.41] + - [907, 9608.41] - - [19712, 8976, 1, 256] - - [821, 9736.35] + - [927, 9736.35] - - [256, 14848, 1, 1024] - - [794, 7163.52] + - [900, 7163.52] - - [256, 8448, 1, 1024] - - [794, 6372.66] + - [900, 6372.66] - - [256, 6400, 1, 1024] - - [808, 6395.81] + - [914, 6395.81] - - [256, 6144, 1, 1024] - - [819, 6490.32] + - [925, 6490.32] - - [9472, 8976, 1, 256] - - [798, 9610.02] + - [904, 9610.02] - - [256, 9984, 1, 1024] - - [781, 6484.85] + - [887, 6484.85] - - [684, 8976, 1, 256] - - [790, 8128.63] + - [896, 8128.63] - - [20992, 8976, 1, 256] - - [813, 9689.75] + - [919, 9689.75] - - [2048, 684, 1, 512] - - [797, 7241.88] + - [903, 7241.88] - - [2048, 114, 1, 768] - - [823, 4872.56] + - [929, 4872.56] - - [8960, 8976, 1, 256] - - [796, 9603.45] + - [902, 9603.45] - - [2048, 1536, 1, 512] - - [799, 8830.21] + - [905, 8830.21] - - [256, 3328, 1, 1024] - - [787, 5612.65] + - [893, 5612.65] - - [33536, 8976, 1, 256] - - [813, 9797.81] + - [919, 9797.81] - - [2048, 8976, 1, 256] - - [813, 8975.56] + - [919, 8975.56] - - [10496, 8976, 1, 256] - - [804, 9654.53] + - [910, 9654.53] - - [256, 5376, 1, 1024] - - [822, 5626.44] + - [928, 5626.44] - - [256, 21248, 1, 1024] - - [791, 7525.55] + - [897, 7525.55] - - [256, 13312, 1, 1024] - - [789, 6767.21] + - [895, 6767.21] - - [16128, 8976, 1, 256] - - [813, 9715.67] + - [919, 9715.67] - - [2304, 8976, 1, 256] - - [786, 9433.93] + - [892, 9433.93] - - [256, 4864, 1, 1024] - - [776, 5743.65] + - [882, 5743.65] - - [17152, 8976, 1, 256] - - [821, 9709.04] + - [927, 9709.04] - - [15872, 8976, 1, 256] - - [821, 9657.67] + - [927, 9657.67] - - [9984, 8976, 1, 256] - - [798, 9639.84] + - [904, 9639.84] - - [256, 14592, 1, 1024] - - [810, 7224.02] + - [916, 7224.02] - - [256, 33536, 1, 1024] - - [817, 8147.41] + - [923, 8147.41] - - [11264, 8976, 1, 256] - - [813, 9510.06] + - [919, 9510.06] - - [31488, 8976, 1, 256] - - [821, 9799.41] + - [927, 9799.41] - - [256, 20480, 1, 1024] - - [794, 7498.3] + - [900, 7498.3] - - [44505, 8976, 1, 256] - - [805, 9804.88] + - [911, 9804.88] - - [13568, 8976, 1, 256] - - [813, 9680.34] + - [919, 9680.34] - - [256, 11520, 1, 1024] - - [793, 6805.36] + - [899, 6805.36] - - [256, 7936, 1, 1024] - - [809, 6971.87] + - [915, 6971.87] - - [2048, 256, 1, 768] - - [779, 7129.23] + - [885, 7129.23] - - [256, 4608, 1, 1024] - - [777, 5463.01] + - [883, 5463.01] - - [256, 2304, 1, 1024] - - [784, 4842.79] + - [890, 4842.79] - - [256, 2560, 1, 1024] - - [785, 5309.35] + - [891, 5309.35] - - [2816, 8976, 1, 256] - - [796, 9409.66] + - [902, 9409.66] + - - [1728, 320, 1, 64] + - [938, 3205.67] + - - [1152, 128, 1, 784] + - [985, 3499.06] + - - [576, 96, 1, 5329] + - [971, 3948.02] + - - [864, 96, 1, 1225] + - [992, 3009.77] + - - [256, 128, 1, 784] + - [982, 1536.59] + - - [1440, 320, 1, 196] + - [935, 4824.72] + - - [192, 48, 1, 1225] + - [1013, 820.565] + - - [2592, 384, 1, 289] + - [953, 7353.11] + - - [192, 80, 36, 10368] + - [1003, 5360.14] + - - [896, 192, 1, 289] + - [970, 3076.66] + - - [768, 128, 1, 289] + - [995, 2351.91] + - - [64, 256, 1, 3136] + - [1021, 1809.26] + - - [1280, 384, 1, 64] + - [935, 3171.2] + - - [512, 144, 1, 196] + - [993, 1445.17] + - - [1344, 192, 1, 289] + - [976, 4376.62] + - - [288, 64, 1, 21609] + - [987, 3396.22] + - - [400, 32, 1, 784] + - [1014, 922.453] + - - [288, 32, 1, 21609] + - [1025, 2816.11] + - - [1280, 448, 1, 64] + - [938, 3253.66] + - - [3456, 256, 1, 169] + - [950, 5822.54] + - - [2304, 256, 1, 196] + - [948, 4932.08] + - - [384, 192, 1, 1225] + - [996, 2720.49] + - - [832, 48, 1, 49] + - [991, 344.618] + - - [832, 192, 1, 49] + - [973, 1099.46] + - - [1280, 192, 1, 64] + - [974, 2069.66] + - - [192, 32, 1, 784] + - [1013, 459.727] + - - [288, 48, 1, 1225] + - [1020, 1176.1] + - - [512, 112, 1, 196] + - [988, 1277.31] + - - [224, 192, 36, 2592] + - [1005, 7369.66] + - - [528, 32, 1, 196] + - [979, 440.474] + - - [192, 128, 36, 1568] + - [1004, 8245.86] + - - [4032, 384, 1, 64] + - [949, 5898.34] + - - [576, 64, 1, 3136] + - [994, 2671.21] + - - [2048, 32, 1, 1001] + - [996, 2323.1] + - - [480, 64, 1, 196] + - [981, 752.74] + - - [512, 256, 1, 196] + - [983, 2528.65] + - - [864, 96, 1, 289] + - [993, 1958.5] + - - [896, 128, 1, 289] + - [996, 2725.83] + - - [192, 64, 1, 784] + - [1011, 898.775] + - - [1200, 64, 1, 1225] + - [995, 2780.24] + - - [1296, 288, 1, 196] + - [934, 3826.28] + - - [576, 96, 1, 5041] + - [975, 3795.68] + - - [1024, 256, 1, 289] + - [964, 4488.23] + - - [1024, 2048, 1, 49] + - [954, 5077.2] + - - [192, 64, 36, 6272] + - [998, 7515.08] + - - [4096, 512, 1, 4096] + - [960, 10276.1] + - - [192, 32, 1, 1225] + - [1014, 556.786] + - - [1024, 256, 1, 196] + - [974, 3892.54] + - - [1120, 192, 1, 289] + - [963, 3752.91] + - - [400, 48, 1, 196] + - [988, 480.1] + - - [1728, 224, 1, 1225] + - [941, 5575.87] + - - [800, 96, 1, 784] + - [995, 2669.04] + - - [1152, 384, 1, 64] + - [945, 3077.44] + - - [4608, 512, 1, 49] + - [952, 4676.7] + - - [1792, 256, 1, 289] + - [945, 5346.04] + - - [864, 128, 1, 784] + - [995, 3816.3] + - - [1728, 384, 1, 169] + - [947, 5191.78] + - - [480, 16, 1, 196] + - [1016, 241.331] + - - [1568, 256, 1, 289] + - [935, 4723.51] + - - [1152, 448, 1, 64] + - [941, 3356.82] + - - [512, 64, 1, 196] + - [980, 802.916] + - - [1344, 224, 1, 289] + - [935, 3519.73] + - - [9216, 512, 1, 4096] + - [958, 9146.12] + - - [27, 32, 1, 22201] + - [1026, 264.456] + - - [1152, 192, 1, 784] + - [965, 4904.18] + - - [1536, 256, 1, 64] + - [933, 2578.57] + - - [800, 128, 1, 196] + - [995, 1991.21] + - - [800, 64, 1, 196] + - [990, 1150.93] + - - [864, 208, 1, 196] + - [967, 2684.82] + - - [1440, 320, 1, 49] + - [936, 2313.54] + - - [512, 128, 1, 784] + - [986, 2780.42] + - - [720, 192, 1, 5041] + - [961, 5410.56] + - - [256, 64, 1, 784] + - [1018, 1163.6] + - - [256, 48, 1, 1225] + - [1013, 1075.3] + - - [576, 192, 1, 3136] + - [961, 4833.11] + - - [160, 64, 1, 5329] + - [1015, 1753.6] + - - [3456, 384, 1, 289] + - [955, 7341.85] + - - [32, 32, 36, 43808] + - [1009, 1378.13] + - - [1344, 512, 1, 64] + - [934, 3823.03] + - - [192, 16, 1, 784] + - [1014, 228.173] + - - [3456, 384, 1, 169] + - [951, 6675.12] + - - [1152, 256, 1, 196] + - [944, 3211.36] + - - [1728, 192, 1, 1225] + - [945, 4852.36] + - - [2048, 512, 1, 49] + - [957, 3471.74] + - - [576, 96, 1, 1225] + - [988, 2176.76] + - - [512, 2048, 1, 49] + - [939, 3845.93] + - - [1728, 192, 1, 64] + - [934, 2369.93] + - - [832, 256, 1, 49] + - [964, 1433.7] + - - [512, 128, 1, 196] + - [989, 1459.77] + - - [1200, 128, 1, 49] + - [984, 1069.19] + - - [528, 256, 1, 196] + - [972, 2069.86] + - - [256, 512, 1, 784] + - [995, 4538.99] + - - [480, 192, 1, 196] + - [995, 1792.1] + - - [96, 64, 36, 2592] + - [1002, 4845.51] + - - [96, 96, 36, 2592] + - [1007, 5111.63] + - - [1024, 192, 1, 289] + - [969, 3431.24] + - - [1536, 384, 1, 64] + - [940, 3166.94] + - - [192, 96, 1, 784] + - [980, 881.24] + - - [2048, 192, 1, 64] + - [937, 2330.27] + - - [192, 64, 1, 1225] + - [1019, 1100.45] + - - [512, 32, 1, 196] + - [1010, 477.967] + - - [128, 96, 36, 1568] + - [1006, 6649.19] + - - [528, 128, 1, 196] + - [992, 1403.33] + - - [128, 512, 1, 784] + - [982, 2237.91] + - - [128, 128, 36, 3136] + - [999, 6538.87] + - - [528, 160, 1, 196] + - [996, 1642.77] + - - [448, 64, 1, 5329] + - [971, 3264.91] + - - [1280, 320, 1, 64] + - [935, 2777.05] + - - [1792, 320, 1, 289] + - [947, 5205.0] + - - [2880, 320, 1, 64] + - [943, 4337.04] + - - [147, 64, 1, 12544] + - [1024, 2430.37] + - - [4096, 512, 1, 1001] + - [959, 9619.09] + - - [1536, 32, 1, 1001] + - [996, 1757.28] + - - [512, 160, 1, 196] + - [992, 1592.99] + - - [768, 160, 1, 289] + - [993, 2757.27] + - - [1728, 384, 1, 49] + - [945, 3102.59] + - - [64, 32, 36, 43808] + - [1000, 2626.53] + - - [64, 64, 1, 3136] + - [1012, 610.606] + - - [256, 32, 1, 784] + - [1013, 612.937] + - - [480, 96, 1, 196] + - [988, 1055.2] + - - [1024, 32, 1, 1001] + - [978, 1188.53] + - - [832, 160, 1, 49] + - [993, 959.347] + - - [512, 1024, 1, 196] + - [936, 4978.8] + - - [96, 64, 36, 10368] + - [1030, 5001.05] + - - [384, 448, 36, 512] + - [1035, 8903.1] + - - [2048, 64, 1, 1001] + - [1028, 4385.23] + - - [224, 192, 36, 5184] + - [1034, 7487.91] + - - [2048, 128, 1, 1001] + - [1027, 5764.73] + - - [96, 96, 36, 10368] + - [1036, 5275.31] + - - [192, 80, 36, 20736] + - [1032, 5409.5] + - - [96, 64, 36, 5184] + - [1030, 4911.93] + - - [1536, 64, 1, 1001] + - [1029, 3162.13] + - - [96, 64, 36, 20736] + - [1031, 5034.43] + - - [384, 448, 36, 256] + - [1033, 8815.97] + - - [96, 96, 36, 5184] + - [1037, 5236.12] - null diff --git a/scripts/performance/sgemm-resnet-inception.sh b/scripts/performance/sgemm-resnet-inception.sh new file mode 100644 index 000000000..7e68facfa --- /dev/null +++ b/scripts/performance/sgemm-resnet-inception.sh @@ -0,0 +1,375 @@ +#!/bin/bash + +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3025 -n 64 -k 363 --alpha 1.0 --a_type f32_r --lda 3025 --b_type f32_r --ldb 363 --beta 0.0 --c_type f32_r --ldc 3025 --d_type f32_r --ldd 3025 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 729 -n 192 -k 1600 --alpha 1.0 --a_type f32_r --lda 729 --b_type f32_r --ldb 1600 --beta 0.0 --c_type f32_r --ldc 729 --d_type f32_r --ldd 729 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 64 -k 147 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 147 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 128 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 160 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 192 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 256 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 32 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 384 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 48 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 196 -n 128 -k 800 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 196 -n 48 -k 400 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 400 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 196 -n 64 -k 600 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 600 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 196 -n 64 -k 800 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 49 -n 128 -k 1200 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 1200 --beta 0.0 --c_type f32_r --ldc 49 --d_type f32_r --ldd 49 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 49 -n 128 -k 800 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 49 --d_type f32_r --ldd 49 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 112 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 128 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 128 -k 528 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 528 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 144 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 16 -k 480 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 480 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 160 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 160 -k 528 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 528 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 192 -k 480 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 480 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 24 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 256 -k 528 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 528 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 32 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 32 -k 528 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 528 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 64 -k 480 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 480 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 64 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 96 -k 480 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 480 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 784 -n 32 -k 400 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 400 --beta 0.0 --c_type f32_r --ldc 784 --d_type f32_r --ldd 784 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 784 -n 96 -k 800 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 784 --d_type f32_r --ldd 784 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1225 -n 64 -k 1200 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1200 --beta 0.0 --c_type f32_r --ldc 1225 --d_type f32_r --ldd 1225 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 192 -k 1280 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1280 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 192 -k 2048 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 320 -k 1280 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1280 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 320 -k 2048 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 384 -k 1280 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1280 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 384 -k 2048 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 448 -k 1280 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1280 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 448 -k 2048 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 21609 -n 32 -k 288 --alpha 1.0 --a_type f32_r --lda 21609 --b_type f32_r --ldb 288 --beta 0.0 --c_type f32_r --ldc 21609 --d_type f32_r --ldd 21609 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 22201 -n 32 -k 27 --alpha 1.0 --a_type f32_r --lda 22201 --b_type f32_r --ldb 27 --beta 0.0 --c_type f32_r --ldc 22201 --d_type f32_r --ldd 22201 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 128 -k 896 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 896 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 160 -k 1120 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1120 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 192 -k 1120 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1120 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 192 -k 1344 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 192 -k 896 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 896 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 384 -k 2592 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 2592 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 96 -k 864 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 864 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 5041 -n 192 -k 720 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 720 --beta 0.0 --c_type f32_r --ldc 5041 --d_type f32_r --ldd 5041 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 192 -k 1728 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 320 -k 1728 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 384 -k 1152 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1225 -n 192 -k 1728 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 1225 --d_type f32_r --ldd 1225 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 256 -k 1536 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1536 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 384 -k 1536 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1536 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 224 -k 1344 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 224 -k 1568 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1568 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 256 -k 1568 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1568 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 256 -k 1792 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1792 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 256 -k 2016 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 2016 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 320 -k 1792 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1792 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 384 -k 3456 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 3456 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 5041 -n 96 -k 576 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 576 --beta 0.0 --c_type f32_r --ldc 5041 --d_type f32_r --ldd 5041 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 5329 -n 64 -k 448 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 448 --beta 0.0 --c_type f32_r --ldc 5329 --d_type f32_r --ldd 5329 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 5329 -n 96 -k 576 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 576 --beta 0.0 --c_type f32_r --ldc 5329 --d_type f32_r --ldd 5329 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 256 -k 1152 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 256 -k 1536 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1536 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 320 -k 2880 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 2880 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 448 -k 1152 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 512 -k 1344 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 1024 -k 256 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 1024 -k 512 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 256 -k 1024 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 256 -k 512 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3136 -n 2048 -k 1024 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3136 -n 2048 -k 512 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3136 -n 512 -k 1024 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3136 -n 512 -k 2048 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 50176 -n 128 -k 256 --alpha 1.0 --a_type f32_r --lda 50176 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 50176 --d_type f32_r --ldd 50176 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 50176 -n 512 -k 256 --alpha 1.0 --a_type f32_r --lda 50176 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 50176 --d_type f32_r --ldd 50176 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 729 -n 1600 -k 192 --alpha 1.0 --a_type f32_r --lda 729 --b_type f32_r --ldb 1600 --beta 0.0 --c_type f32_r --ldc 729 --d_type f32_r --ldd 729 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 196 -n 400 -k 48 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 400 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 196 -n 600 -k 64 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 600 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 196 -n 800 -k 128 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 196 -n 800 -k 64 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 49 -n 1200 -k 128 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 1200 --beta 0.0 --c_type f32_r --ldc 49 --d_type f32_r --ldd 49 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 49 -n 800 -k 128 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 49 --d_type f32_r --ldd 49 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 784 -n 400 -k 32 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 400 --beta 0.0 --c_type f32_r --ldc 784 --d_type f32_r --ldd 784 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 784 -n 800 -k 96 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 784 --d_type f32_r --ldd 784 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 1225 -n 1200 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1200 --beta 0.0 --c_type f32_r --ldc 1225 --d_type f32_r --ldd 1225 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 21609 -n 288 -k 32 --alpha 1.0 --a_type f32_r --lda 21609 --b_type f32_r --ldb 288 --beta 0.0 --c_type f32_r --ldc 21609 --d_type f32_r --ldd 21609 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1120 -k 160 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1120 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1120 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1120 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1344 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 2592 -k 384 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 2592 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 864 -k 96 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 864 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 896 -k 128 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 896 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 896 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 896 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 5041 -n 720 -k 192 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 720 --beta 0.0 --c_type f32_r --ldc 5041 --d_type f32_r --ldd 5041 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1152 -k 384 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1728 -k 192 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1728 -k 320 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 1225 -n 1728 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 1225 --d_type f32_r --ldd 1225 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1344 -k 224 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1568 -k 224 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1568 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1568 -k 256 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1568 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1792 -k 256 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1792 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1792 -k 320 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1792 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 2016 -k 256 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 2016 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 3456 -k 384 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 3456 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 5041 -n 576 -k 96 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 576 --beta 0.0 --c_type f32_r --ldc 5041 --d_type f32_r --ldd 5041 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 5329 -n 448 -k 64 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 448 --beta 0.0 --c_type f32_r --ldc 5329 --d_type f32_r --ldd 5329 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 5329 -n 576 -k 96 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 576 --beta 0.0 --c_type f32_r --ldc 5329 --d_type f32_r --ldd 5329 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1152 -k 256 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1152 -k 448 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1344 -k 512 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1536 -k 256 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1536 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 2880 -k 320 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 2880 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 12544 -n 512 -k 1024 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 12544 -n 512 -k 256 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 3136 -n 1024 -k 2048 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 3136 -n 1024 -k 512 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 50176 -n 256 -k 128 --alpha 1.0 --a_type f32_r --lda 50176 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 50176 --d_type f32_r --ldd 50176 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 50176 -n 256 -k 512 --alpha 1.0 --a_type f32_r --lda 50176 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 50176 --d_type f32_r --ldd 50176 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 3136 -n 64 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 128 -k 256 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 100352 --d_type f32_r --ldd 784 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 16 -k 192 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 150528 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 12544 --d_type f32_r --ldd 784 --stride_d 12544 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 32 -k 192 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 150528 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 25088 --d_type f32_r --ldd 784 --stride_d 25088 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 32 -k 256 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 25088 --d_type f32_r --ldd 784 --stride_d 25088 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 64 -k 192 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 150528 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 50176 --d_type f32_r --ldd 784 --stride_d 50176 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 64 -k 256 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 50176 --d_type f32_r --ldd 784 --stride_d 50176 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 96 -k 192 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 150528 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 75264 --d_type f32_r --ldd 784 --stride_d 75264 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 32 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 235200 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 39200 --d_type f32_r --ldd 1225 --stride_d 39200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 48 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 235200 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 58800 --d_type f32_r --ldd 1225 --stride_d 58800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 48 -k 256 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 313600 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 58800 --d_type f32_r --ldd 1225 --stride_d 58800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 48 -k 288 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 352800 --b_type f32_r --ldb 288 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 58800 --d_type f32_r --ldd 1225 --stride_d 58800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 64 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 235200 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 78400 --d_type f32_r --ldd 1225 --stride_d 78400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 64 -k 256 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 313600 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 78400 --d_type f32_r --ldd 1225 --stride_d 78400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 64 -k 288 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 352800 --b_type f32_r --ldb 288 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 78400 --d_type f32_r --ldd 1225 --stride_d 78400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 128 -k 768 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 221952 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 36992 --d_type f32_r --ldd 289 --stride_d 36992 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 160 -k 768 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 221952 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 46240 --d_type f32_r --ldd 289 --stride_d 46240 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 192 -k 768 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 221952 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 55488 --d_type f32_r --ldd 289 --stride_d 55488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 5329 -n 80 -k 64 --alpha 1.0 --a_type f32_r --lda 5329 --stride_a 341056 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 5329 --stride_c 426320 --d_type f32_r --ldd 5329 --stride_d 426320 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 192 -k 384 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 470400 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 235200 --d_type f32_r --ldd 1225 --stride_d 235200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 64 -k 384 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 470400 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 78400 --d_type f32_r --ldd 1225 --stride_d 78400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 96 -k 384 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 470400 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 117600 --d_type f32_r --ldd 1225 --stride_d 117600 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 128 -k 1024 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 295936 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 36992 --d_type f32_r --ldd 289 --stride_d 36992 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 192 -k 1024 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 295936 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 55488 --d_type f32_r --ldd 289 --stride_d 55488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 256 -k 1024 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 295936 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 73984 --d_type f32_r --ldd 289 --stride_d 73984 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 384 -k 1024 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 295936 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 110976 --d_type f32_r --ldd 289 --stride_d 110976 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 5329 -n 64 -k 160 --alpha 1.0 --a_type f32_r --lda 5329 --stride_a 852640 --b_type f32_r --ldb 160 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 5329 --stride_c 341056 --d_type f32_r --ldd 5329 --stride_d 341056 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 3136 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 802816 --d_type f32_r --ldd 3136 --stride_d 802816 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 3136 -n 64 -k 256 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 802816 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 3136 -n 64 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 128 -k 512 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 401408 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 100352 --d_type f32_r --ldd 784 --stride_d 100352 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 512 -k 128 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 100352 --b_type f32_r --ldb 128 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 401408 --d_type f32_r --ldd 784 --stride_d 401408 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 480 -k 16 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 3136 --b_type f32_r --ldb 480 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 94080 --d_type f32_r --ldd 196 --stride_d 94080 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 480 -k 192 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 37632 --b_type f32_r --ldb 480 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 94080 --d_type f32_r --ldd 196 --stride_d 94080 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 480 -k 64 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 12544 --b_type f32_r --ldb 480 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 94080 --d_type f32_r --ldd 196 --stride_d 94080 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 480 -k 96 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 18816 --b_type f32_r --ldb 480 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 94080 --d_type f32_r --ldd 196 --stride_d 94080 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 112 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 21952 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 128 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 25088 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 144 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 28224 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 160 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 31360 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 24 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 4704 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 32 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 6272 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 64 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 12544 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 528 -k 128 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 25088 --b_type f32_r --ldb 528 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 103488 --d_type f32_r --ldd 196 --stride_d 103488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 528 -k 160 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 31360 --b_type f32_r --ldb 528 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 103488 --d_type f32_r --ldd 196 --stride_d 103488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 528 -k 256 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 50176 --b_type f32_r --ldb 528 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 103488 --d_type f32_r --ldd 196 --stride_d 103488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 528 -k 32 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 6272 --b_type f32_r --ldb 528 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 103488 --d_type f32_r --ldd 196 --stride_d 103488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 3136 -n 64 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 128 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 6272 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 160 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 7840 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 192 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 9408 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 256 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 12544 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 32 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 1568 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 384 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 18816 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 48 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 2352 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 192 -k 16 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 12544 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 150528 --d_type f32_r --ldd 784 --stride_d 150528 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 192 -k 32 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 25088 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 150528 --d_type f32_r --ldd 784 --stride_d 150528 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 50176 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 150528 --d_type f32_r --ldd 784 --stride_d 150528 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 192 -k 96 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 75264 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 150528 --d_type f32_r --ldd 784 --stride_d 150528 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 256 -k 128 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 100352 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 200704 --d_type f32_r --ldd 784 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 256 -k 32 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 25088 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 200704 --d_type f32_r --ldd 784 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 50176 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 200704 --d_type f32_r --ldd 784 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 192 -k 32 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 39200 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 235200 --d_type f32_r --ldd 1225 --stride_d 235200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 192 -k 48 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 58800 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 235200 --d_type f32_r --ldd 1225 --stride_d 235200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 78400 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 235200 --d_type f32_r --ldd 1225 --stride_d 235200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 256 -k 48 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 58800 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 313600 --d_type f32_r --ldd 1225 --stride_d 313600 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 78400 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 313600 --d_type f32_r --ldd 1225 --stride_d 313600 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 288 -k 48 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 58800 --b_type f32_r --ldb 288 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 352800 --d_type f32_r --ldd 1225 --stride_d 352800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 288 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 78400 --b_type f32_r --ldb 288 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 352800 --d_type f32_r --ldd 1225 --stride_d 352800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 768 -k 128 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 36992 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 221952 --d_type f32_r --ldd 289 --stride_d 221952 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 768 -k 160 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 46240 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 221952 --d_type f32_r --ldd 289 --stride_d 221952 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 768 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 55488 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 221952 --d_type f32_r --ldd 289 --stride_d 221952 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 5329 -n 64 -k 80 --alpha 1.0 --a_type f32_r --lda 5329 --stride_a 426320 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 5329 --stride_c 341056 --d_type f32_r --ldd 5329 --stride_d 341056 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1280 -k 192 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 12288 --b_type f32_r --ldb 1280 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 81920 --d_type f32_r --ldd 64 --stride_d 81920 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1280 -k 320 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 20480 --b_type f32_r --ldb 1280 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 81920 --d_type f32_r --ldd 64 --stride_d 81920 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1280 -k 384 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 24576 --b_type f32_r --ldb 1280 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 81920 --d_type f32_r --ldd 64 --stride_d 81920 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1280 -k 448 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 28672 --b_type f32_r --ldb 1280 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 81920 --d_type f32_r --ldd 64 --stride_d 81920 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 2048 -k 192 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 12288 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 131072 --d_type f32_r --ldd 64 --stride_d 131072 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 2048 -k 320 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 20480 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 131072 --d_type f32_r --ldd 64 --stride_d 131072 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 2048 -k 384 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 24576 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 131072 --d_type f32_r --ldd 64 --stride_d 131072 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 2048 -k 448 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 28672 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 131072 --d_type f32_r --ldd 64 --stride_d 131072 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 384 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 235200 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 470400 --d_type f32_r --ldd 1225 --stride_d 470400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 78400 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 470400 --d_type f32_r --ldd 1225 --stride_d 470400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 384 -k 96 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 117600 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 470400 --d_type f32_r --ldd 1225 --stride_d 470400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 1024 -k 128 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 36992 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 295936 --d_type f32_r --ldd 289 --stride_d 295936 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 1024 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 55488 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 295936 --d_type f32_r --ldd 289 --stride_d 295936 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 1024 -k 256 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 73984 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 295936 --d_type f32_r --ldd 289 --stride_d 295936 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 1024 -k 384 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 110976 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 295936 --d_type f32_r --ldd 289 --stride_d 295936 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 5329 -n 160 -k 64 --alpha 1.0 --a_type f32_r --lda 5329 --stride_a 341056 --b_type f32_r --ldb 160 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 5329 --stride_c 852640 --d_type f32_r --ldd 5329 --stride_d 852640 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1536 -k 256 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 16384 --b_type f32_r --ldb 1536 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 98304 --d_type f32_r --ldd 64 --stride_d 98304 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1536 -k 384 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 24576 --b_type f32_r --ldb 1536 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 98304 --d_type f32_r --ldd 64 --stride_d 98304 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 1024 -k 256 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 50176 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 200704 --d_type f32_r --ldd 196 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 256 -k 1024 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 50176 --d_type f32_r --ldd 196 --stride_d 50176 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 3136 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 802816 --d_type f32_r --ldd 3136 --stride_d 802816 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 3136 -n 64 -k 256 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 802816 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 3136 -n 64 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 2048 -k 512 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 25088 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 100352 --d_type f32_r --ldd 49 --stride_d 100352 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 512 -k 2048 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 100352 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 25088 --d_type f32_r --ldd 49 --stride_d 25088 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 128 -k 512 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 401408 --b_type f32_r --ldb 128 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 100352 --d_type f32_r --ldd 784 --stride_d 100352 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 512 -k 128 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 100352 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 401408 --d_type f32_r --ldd 784 --stride_d 401408 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 128 -n 96 -k 1568 --alpha 1.0 --a_type f32_r --lda 1568 --stride_a 200704 --b_type f32_r --ldb 1568 --stride_b 150528 --beta 0.0 --c_type f32_r --ldc 128 --stride_c 12288 --d_type f32_r --ldd 128 --stride_d 12288 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 192 -n 128 -k 1568 --alpha 1.0 --a_type f32_r --lda 1568 --stride_a 301056 --b_type f32_r --ldb 1568 --stride_b 200704 --beta 0.0 --c_type f32_r --ldc 192 --stride_c 24576 --d_type f32_r --ldd 192 --stride_d 24576 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 192 -n 64 -k 6272 --alpha 1.0 --a_type f32_r --lda 6272 --stride_a 1204224 --b_type f32_r --ldb 6272 --stride_b 401408 --beta 0.0 --c_type f32_r --ldc 192 --stride_c 12288 --d_type f32_r --ldd 192 --stride_d 12288 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 192 -n 80 -k 10368 --alpha 1.0 --a_type f32_r --lda 10368 --stride_a 1990656 --b_type f32_r --ldb 10368 --stride_b 829440 --beta 0.0 --c_type f32_r --ldc 192 --stride_c 15360 --d_type f32_r --ldd 192 --stride_d 15360 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 32 -n 32 -k 43808 --alpha 1.0 --a_type f32_r --lda 43808 --stride_a 1401856 --b_type f32_r --ldb 43808 --stride_b 1401856 --beta 0.0 --c_type f32_r --ldc 32 --stride_c 1024 --d_type f32_r --ldd 32 --stride_d 1024 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 64 -n 32 -k 43808 --alpha 1.0 --a_type f32_r --lda 43808 --stride_a 2803712 --b_type f32_r --ldb 43808 --stride_b 1401856 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 2048 --d_type f32_r --ldd 64 --stride_d 2048 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 96 -n 64 -k 2592 --alpha 1.0 --a_type f32_r --lda 2592 --stride_a 248832 --b_type f32_r --ldb 2592 --stride_b 165888 --beta 0.0 --c_type f32_r --ldc 96 --stride_c 6144 --d_type f32_r --ldd 96 --stride_d 6144 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 96 -n 96 -k 2592 --alpha 1.0 --a_type f32_r --lda 2592 --stride_a 248832 --b_type f32_r --ldb 2592 --stride_b 248832 --beta 0.0 --c_type f32_r --ldc 96 --stride_c 9216 --d_type f32_r --ldd 96 --stride_d 9216 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 224 -n 192 -k 2592 --alpha 1.0 --a_type f32_r --lda 2592 --stride_a 580608 --b_type f32_r --ldb 2592 --stride_b 497664 --beta 0.0 --c_type f32_r --ldc 224 --stride_c 43008 --d_type f32_r --ldd 224 --stride_d 43008 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 96 -n 64 -k 10368 --alpha 1.0 --a_type f32_r --lda 10368 --stride_a 995328 --b_type f32_r --ldb 10368 --stride_b 663552 --beta 0.0 --c_type f32_r --ldc 96 --stride_c 6144 --d_type f32_r --ldd 96 --stride_d 6144 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 128 -n 128 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 401408 --b_type f32_r --ldb 3136 --stride_b 401408 --beta 0.0 --c_type f32_r --ldc 128 --stride_c 16384 --d_type f32_r --ldd 128 --stride_d 16384 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 64 -n 64 -k 12544 --alpha 1.0 --a_type f32_r --lda 12544 --stride_a 802816 --b_type f32_r --ldb 12544 --stride_b 802816 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 4096 --d_type f32_r --ldd 64 --stride_d 4096 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1600 -n 192 -k 729 --alpha 1.0 --a_type f32_r --lda 729 --b_type f32_r --ldb 729 --beta 1.0 --c_type f32_r --ldc 1600 --d_type f32_r --ldd 1600 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 384 -k 169 --alpha 1.0 --a_type f32_r --lda 169 --b_type f32_r --ldb 169 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 3456 -n 256 -k 169 --alpha 1.0 --a_type f32_r --lda 169 --b_type f32_r --ldb 169 --beta 1.0 --c_type f32_r --ldc 3456 --d_type f32_r --ldd 3456 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 3456 -n 384 -k 169 --alpha 1.0 --a_type f32_r --lda 169 --b_type f32_r --ldb 169 --beta 1.0 --c_type f32_r --ldc 3456 --d_type f32_r --ldd 3456 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 363 -n 64 -k 3025 --alpha 1.0 --a_type f32_r --lda 3025 --b_type f32_r --ldb 3025 --beta 1.0 --c_type f32_r --ldc 363 --d_type f32_r --ldd 363 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1008 -n 224 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1008 --d_type f32_r --ldd 1008 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 192 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1200 -n 128 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1200 --d_type f32_r --ldd 1200 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1296 -n 288 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1296 --d_type f32_r --ldd 1296 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1440 -n 320 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1440 --d_type f32_r --ldd 1440 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1440 -n 320 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1440 --d_type f32_r --ldd 1440 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 147 -n 64 -k 12544 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 12544 --beta 1.0 --c_type f32_r --ldc 147 --d_type f32_r --ldd 147 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 384 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 16 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 32 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 64 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 96 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 128 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 32 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 64 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 400 -n 32 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 400 --d_type f32_r --ldd 400 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 400 -n 48 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 400 --d_type f32_r --ldd 400 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 480 -n 16 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 480 --d_type f32_r --ldd 480 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 480 -n 192 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 480 --d_type f32_r --ldd 480 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 480 -n 64 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 480 --d_type f32_r --ldd 480 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 480 -n 96 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 480 --d_type f32_r --ldd 480 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 112 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 128 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 144 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 160 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 24 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 32 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 64 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 528 -n 128 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 528 --d_type f32_r --ldd 528 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 528 -n 160 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 528 --d_type f32_r --ldd 528 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 528 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 528 --d_type f32_r --ldd 528 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 528 -n 32 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 528 --d_type f32_r --ldd 528 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 192 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 600 -n 64 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 600 --d_type f32_r --ldd 600 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 64 -n 64 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 800 -n 128 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 800 --d_type f32_r --ldd 800 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 800 -n 128 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 800 --d_type f32_r --ldd 800 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 800 -n 64 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 800 --d_type f32_r --ldd 800 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 800 -n 96 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 800 --d_type f32_r --ldd 800 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 128 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 160 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 192 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 256 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 32 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 384 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 48 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 864 -n 128 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 864 --d_type f32_r --ldd 864 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 864 -n 208 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 864 --d_type f32_r --ldd 864 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1120 -n 160 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1120 --d_type f32_r --ldd 1120 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1120 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1120 --d_type f32_r --ldd 1120 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1200 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 1200 --d_type f32_r --ldd 1200 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1280 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1280 --d_type f32_r --ldd 1280 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1280 -n 320 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1280 --d_type f32_r --ldd 1280 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1280 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1280 --d_type f32_r --ldd 1280 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1280 -n 448 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1280 --d_type f32_r --ldd 1280 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1344 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1344 --d_type f32_r --ldd 1344 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 320 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 32 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 48 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 320 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 448 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 48 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2592 -n 384 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 2592 --d_type f32_r --ldd 2592 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 27 -n 32 -k 22201 --alpha 1.0 --a_type f32_r --lda 22201 --b_type f32_r --ldb 22201 --beta 1.0 --c_type f32_r --ldc 27 --d_type f32_r --ldd 27 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 288 -n 32 -k 21609 --alpha 1.0 --a_type f32_r --lda 21609 --b_type f32_r --ldb 21609 --beta 1.0 --c_type f32_r --ldc 288 --d_type f32_r --ldd 288 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 288 -n 48 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 288 --d_type f32_r --ldd 288 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 288 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 288 --d_type f32_r --ldd 288 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 288 -n 64 -k 21609 --alpha 1.0 --a_type f32_r --lda 21609 --b_type f32_r --ldb 21609 --beta 1.0 --c_type f32_r --ldc 288 --d_type f32_r --ldd 288 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 4032 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 4032 --d_type f32_r --ldd 4032 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 96 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 64 -n 80 -k 5329 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 5329 --beta 1.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 720 -n 192 -k 5041 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 5041 --beta 1.0 --c_type f32_r --ldc 720 --d_type f32_r --ldd 720 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 768 -n 128 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 768 --d_type f32_r --ldd 768 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 768 -n 160 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 768 --d_type f32_r --ldd 768 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 768 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 768 --d_type f32_r --ldd 768 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 864 -n 96 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 864 --d_type f32_r --ldd 864 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 864 -n 96 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 864 --d_type f32_r --ldd 864 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 896 -n 128 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 896 --d_type f32_r --ldd 896 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 896 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 896 --d_type f32_r --ldd 896 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 128 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 256 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 384 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 448 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1344 -n 224 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1344 --d_type f32_r --ldd 1344 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1344 -n 512 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1344 --d_type f32_r --ldd 1344 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1536 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1536 --d_type f32_r --ldd 1536 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1536 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1536 --d_type f32_r --ldd 1536 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1568 -n 224 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1568 -n 256 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 160 -n 64 -k 5329 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 5329 --beta 1.0 --c_type f32_r --ldc 160 --d_type f32_r --ldd 160 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 192 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 224 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1792 -n 256 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1792 --d_type f32_r --ldd 1792 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1792 -n 320 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1792 --d_type f32_r --ldd 1792 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2016 -n 256 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 2016 --d_type f32_r --ldd 2016 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2880 -n 320 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2880 --d_type f32_r --ldd 2880 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 3456 -n 384 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 3456 --d_type f32_r --ldd 3456 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 384 -n 192 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 384 --d_type f32_r --ldd 384 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 384 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 384 --d_type f32_r --ldd 384 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 384 -n 96 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 384 --d_type f32_r --ldd 384 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 448 -n 64 -k 5329 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 5329 --beta 1.0 --c_type f32_r --ldc 448 --d_type f32_r --ldd 448 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 96 -k 5041 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 5041 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 96 -k 5329 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 5329 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 2048 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 512 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 128 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 128 -n 512 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 128 --d_type f32_r --ldd 128 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 512 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2304 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 2304 --d_type f32_r --ldd 2304 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 1024 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 512 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 64 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 4608 -n 512 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 4608 --d_type f32_r --ldd 4608 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 1024 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 128 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 2048 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 64 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 64 -n 256 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 diff --git a/scripts/performance/sgemm-tf-inception.sh b/scripts/performance/sgemm-tf-inception.sh new file mode 100644 index 000000000..7e68facfa --- /dev/null +++ b/scripts/performance/sgemm-tf-inception.sh @@ -0,0 +1,375 @@ +#!/bin/bash + +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3025 -n 64 -k 363 --alpha 1.0 --a_type f32_r --lda 3025 --b_type f32_r --ldb 363 --beta 0.0 --c_type f32_r --ldc 3025 --d_type f32_r --ldd 3025 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 729 -n 192 -k 1600 --alpha 1.0 --a_type f32_r --lda 729 --b_type f32_r --ldb 1600 --beta 0.0 --c_type f32_r --ldc 729 --d_type f32_r --ldd 729 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 64 -k 147 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 147 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 128 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 160 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 192 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 256 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 32 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 384 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1568 -n 48 -k 832 --alpha 1.0 --a_type f32_r --lda 1568 --b_type f32_r --ldb 832 --beta 0.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 196 -n 128 -k 800 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 196 -n 48 -k 400 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 400 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 196 -n 64 -k 600 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 600 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 196 -n 64 -k 800 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 49 -n 128 -k 1200 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 1200 --beta 0.0 --c_type f32_r --ldc 49 --d_type f32_r --ldd 49 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 49 -n 128 -k 800 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 49 --d_type f32_r --ldd 49 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 112 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 128 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 128 -k 528 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 528 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 144 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 16 -k 480 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 480 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 160 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 160 -k 528 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 528 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 192 -k 480 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 480 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 24 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 256 -k 528 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 528 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 32 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 32 -k 528 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 528 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 64 -k 480 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 480 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 64 -k 512 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 6272 -n 96 -k 480 --alpha 1.0 --a_type f32_r --lda 6272 --b_type f32_r --ldb 480 --beta 0.0 --c_type f32_r --ldc 6272 --d_type f32_r --ldd 6272 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 784 -n 32 -k 400 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 400 --beta 0.0 --c_type f32_r --ldc 784 --d_type f32_r --ldd 784 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 784 -n 96 -k 800 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 784 --d_type f32_r --ldd 784 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1225 -n 64 -k 1200 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1200 --beta 0.0 --c_type f32_r --ldc 1225 --d_type f32_r --ldd 1225 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 192 -k 1280 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1280 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 192 -k 2048 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 320 -k 1280 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1280 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 320 -k 2048 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 384 -k 1280 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1280 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 384 -k 2048 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 448 -k 1280 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1280 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 448 -k 2048 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 21609 -n 32 -k 288 --alpha 1.0 --a_type f32_r --lda 21609 --b_type f32_r --ldb 288 --beta 0.0 --c_type f32_r --ldc 21609 --d_type f32_r --ldd 21609 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 22201 -n 32 -k 27 --alpha 1.0 --a_type f32_r --lda 22201 --b_type f32_r --ldb 27 --beta 0.0 --c_type f32_r --ldc 22201 --d_type f32_r --ldd 22201 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 128 -k 896 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 896 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 160 -k 1120 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1120 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 192 -k 1120 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1120 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 192 -k 1344 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 192 -k 896 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 896 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 384 -k 2592 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 2592 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 96 -k 864 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 864 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 5041 -n 192 -k 720 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 720 --beta 0.0 --c_type f32_r --ldc 5041 --d_type f32_r --ldd 5041 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 192 -k 1728 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 320 -k 1728 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 384 -k 1152 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1225 -n 192 -k 1728 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 1225 --d_type f32_r --ldd 1225 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 256 -k 1536 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1536 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 2048 -n 384 -k 1536 --alpha 1.0 --a_type f32_r --lda 2048 --b_type f32_r --ldb 1536 --beta 0.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 224 -k 1344 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 224 -k 1568 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1568 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 256 -k 1568 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1568 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 256 -k 1792 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1792 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 256 -k 2016 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 2016 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 320 -k 1792 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1792 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 289 -n 384 -k 3456 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 3456 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 5041 -n 96 -k 576 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 576 --beta 0.0 --c_type f32_r --ldc 5041 --d_type f32_r --ldd 5041 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 5329 -n 64 -k 448 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 448 --beta 0.0 --c_type f32_r --ldc 5329 --d_type f32_r --ldd 5329 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 5329 -n 96 -k 576 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 576 --beta 0.0 --c_type f32_r --ldc 5329 --d_type f32_r --ldd 5329 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 256 -k 1152 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 256 -k 1536 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1536 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 320 -k 2880 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 2880 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 448 -k 1152 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 64 -n 512 -k 1344 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 1024 -k 256 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 1024 -k 512 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 256 -k 1024 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 12544 -n 256 -k 512 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3136 -n 2048 -k 1024 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3136 -n 2048 -k 512 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3136 -n 512 -k 1024 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 3136 -n 512 -k 2048 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 2048 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 50176 -n 128 -k 256 --alpha 1.0 --a_type f32_r --lda 50176 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 50176 --d_type f32_r --ldd 50176 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 50176 -n 512 -k 256 --alpha 1.0 --a_type f32_r --lda 50176 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 50176 --d_type f32_r --ldd 50176 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 729 -n 1600 -k 192 --alpha 1.0 --a_type f32_r --lda 729 --b_type f32_r --ldb 1600 --beta 0.0 --c_type f32_r --ldc 729 --d_type f32_r --ldd 729 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 196 -n 400 -k 48 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 400 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 196 -n 600 -k 64 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 600 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 196 -n 800 -k 128 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 196 -n 800 -k 64 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 196 --d_type f32_r --ldd 196 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 49 -n 1200 -k 128 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 1200 --beta 0.0 --c_type f32_r --ldc 49 --d_type f32_r --ldd 49 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 49 -n 800 -k 128 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 49 --d_type f32_r --ldd 49 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 784 -n 400 -k 32 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 400 --beta 0.0 --c_type f32_r --ldc 784 --d_type f32_r --ldd 784 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 784 -n 800 -k 96 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 800 --beta 0.0 --c_type f32_r --ldc 784 --d_type f32_r --ldd 784 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 1225 -n 1200 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1200 --beta 0.0 --c_type f32_r --ldc 1225 --d_type f32_r --ldd 1225 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 21609 -n 288 -k 32 --alpha 1.0 --a_type f32_r --lda 21609 --b_type f32_r --ldb 288 --beta 0.0 --c_type f32_r --ldc 21609 --d_type f32_r --ldd 21609 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1120 -k 160 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1120 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1120 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1120 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1344 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 2592 -k 384 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 2592 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 864 -k 96 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 864 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 896 -k 128 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 896 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 896 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 896 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 5041 -n 720 -k 192 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 720 --beta 0.0 --c_type f32_r --ldc 5041 --d_type f32_r --ldd 5041 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1152 -k 384 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1728 -k 192 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1728 -k 320 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 1225 -n 1728 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1728 --beta 0.0 --c_type f32_r --ldc 1225 --d_type f32_r --ldd 1225 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1344 -k 224 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1568 -k 224 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1568 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1568 -k 256 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1568 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1792 -k 256 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1792 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 1792 -k 320 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 1792 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 2016 -k 256 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 2016 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 289 -n 3456 -k 384 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 3456 --beta 0.0 --c_type f32_r --ldc 289 --d_type f32_r --ldd 289 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 5041 -n 576 -k 96 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 576 --beta 0.0 --c_type f32_r --ldc 5041 --d_type f32_r --ldd 5041 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 5329 -n 448 -k 64 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 448 --beta 0.0 --c_type f32_r --ldc 5329 --d_type f32_r --ldd 5329 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 5329 -n 576 -k 96 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 576 --beta 0.0 --c_type f32_r --ldc 5329 --d_type f32_r --ldd 5329 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1152 -k 256 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1152 -k 448 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1152 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1344 -k 512 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1344 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 1536 -k 256 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 1536 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 64 -n 2880 -k 320 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 2880 --beta 0.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 12544 -n 512 -k 1024 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 12544 -n 512 -k 256 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 512 --beta 0.0 --c_type f32_r --ldc 12544 --d_type f32_r --ldd 12544 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 3136 -n 1024 -k 2048 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 3136 -n 1024 -k 512 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 1024 --beta 0.0 --c_type f32_r --ldc 3136 --d_type f32_r --ldd 3136 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 50176 -n 256 -k 128 --alpha 1.0 --a_type f32_r --lda 50176 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 50176 --d_type f32_r --ldd 50176 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 50176 -n 256 -k 512 --alpha 1.0 --a_type f32_r --lda 50176 --b_type f32_r --ldb 256 --beta 0.0 --c_type f32_r --ldc 50176 --d_type f32_r --ldd 50176 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 3136 -n 64 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 128 -k 256 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 100352 --d_type f32_r --ldd 784 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 16 -k 192 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 150528 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 12544 --d_type f32_r --ldd 784 --stride_d 12544 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 32 -k 192 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 150528 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 25088 --d_type f32_r --ldd 784 --stride_d 25088 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 32 -k 256 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 25088 --d_type f32_r --ldd 784 --stride_d 25088 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 64 -k 192 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 150528 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 50176 --d_type f32_r --ldd 784 --stride_d 50176 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 64 -k 256 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 50176 --d_type f32_r --ldd 784 --stride_d 50176 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 96 -k 192 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 150528 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 75264 --d_type f32_r --ldd 784 --stride_d 75264 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 32 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 235200 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 39200 --d_type f32_r --ldd 1225 --stride_d 39200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 48 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 235200 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 58800 --d_type f32_r --ldd 1225 --stride_d 58800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 48 -k 256 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 313600 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 58800 --d_type f32_r --ldd 1225 --stride_d 58800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 48 -k 288 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 352800 --b_type f32_r --ldb 288 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 58800 --d_type f32_r --ldd 1225 --stride_d 58800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 64 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 235200 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 78400 --d_type f32_r --ldd 1225 --stride_d 78400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 64 -k 256 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 313600 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 78400 --d_type f32_r --ldd 1225 --stride_d 78400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 64 -k 288 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 352800 --b_type f32_r --ldb 288 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 78400 --d_type f32_r --ldd 1225 --stride_d 78400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 128 -k 768 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 221952 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 36992 --d_type f32_r --ldd 289 --stride_d 36992 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 160 -k 768 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 221952 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 46240 --d_type f32_r --ldd 289 --stride_d 46240 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 192 -k 768 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 221952 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 55488 --d_type f32_r --ldd 289 --stride_d 55488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 5329 -n 80 -k 64 --alpha 1.0 --a_type f32_r --lda 5329 --stride_a 341056 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 5329 --stride_c 426320 --d_type f32_r --ldd 5329 --stride_d 426320 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 192 -k 384 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 470400 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 235200 --d_type f32_r --ldd 1225 --stride_d 235200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 64 -k 384 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 470400 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 78400 --d_type f32_r --ldd 1225 --stride_d 78400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 1225 -n 96 -k 384 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 470400 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 117600 --d_type f32_r --ldd 1225 --stride_d 117600 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 128 -k 1024 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 295936 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 36992 --d_type f32_r --ldd 289 --stride_d 36992 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 192 -k 1024 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 295936 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 55488 --d_type f32_r --ldd 289 --stride_d 55488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 256 -k 1024 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 295936 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 73984 --d_type f32_r --ldd 289 --stride_d 73984 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 289 -n 384 -k 1024 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 295936 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 110976 --d_type f32_r --ldd 289 --stride_d 110976 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 5329 -n 64 -k 160 --alpha 1.0 --a_type f32_r --lda 5329 --stride_a 852640 --b_type f32_r --ldb 160 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 5329 --stride_c 341056 --d_type f32_r --ldd 5329 --stride_d 341056 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 3136 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 802816 --d_type f32_r --ldd 3136 --stride_d 802816 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 3136 -n 64 -k 256 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 802816 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 3136 -n 64 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 128 -k 512 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 401408 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 100352 --d_type f32_r --ldd 784 --stride_d 100352 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 784 -n 512 -k 128 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 100352 --b_type f32_r --ldb 128 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 401408 --d_type f32_r --ldd 784 --stride_d 401408 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 480 -k 16 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 3136 --b_type f32_r --ldb 480 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 94080 --d_type f32_r --ldd 196 --stride_d 94080 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 480 -k 192 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 37632 --b_type f32_r --ldb 480 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 94080 --d_type f32_r --ldd 196 --stride_d 94080 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 480 -k 64 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 12544 --b_type f32_r --ldb 480 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 94080 --d_type f32_r --ldd 196 --stride_d 94080 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 480 -k 96 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 18816 --b_type f32_r --ldb 480 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 94080 --d_type f32_r --ldd 196 --stride_d 94080 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 112 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 21952 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 128 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 25088 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 144 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 28224 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 160 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 31360 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 24 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 4704 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 32 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 6272 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 512 -k 64 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 12544 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 100352 --d_type f32_r --ldd 196 --stride_d 100352 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 528 -k 128 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 25088 --b_type f32_r --ldb 528 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 103488 --d_type f32_r --ldd 196 --stride_d 103488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 528 -k 160 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 31360 --b_type f32_r --ldb 528 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 103488 --d_type f32_r --ldd 196 --stride_d 103488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 528 -k 256 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 50176 --b_type f32_r --ldb 528 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 103488 --d_type f32_r --ldd 196 --stride_d 103488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 528 -k 32 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 6272 --b_type f32_r --ldb 528 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 103488 --d_type f32_r --ldd 196 --stride_d 103488 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 3136 -n 64 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 128 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 6272 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 160 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 7840 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 192 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 9408 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 256 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 12544 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 32 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 1568 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 384 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 18816 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 832 -k 48 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 2352 --b_type f32_r --ldb 832 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 40768 --d_type f32_r --ldd 49 --stride_d 40768 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 192 -k 16 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 12544 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 150528 --d_type f32_r --ldd 784 --stride_d 150528 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 192 -k 32 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 25088 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 150528 --d_type f32_r --ldd 784 --stride_d 150528 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 50176 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 150528 --d_type f32_r --ldd 784 --stride_d 150528 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 192 -k 96 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 75264 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 150528 --d_type f32_r --ldd 784 --stride_d 150528 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 256 -k 128 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 100352 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 200704 --d_type f32_r --ldd 784 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 256 -k 32 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 25088 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 200704 --d_type f32_r --ldd 784 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 50176 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 200704 --d_type f32_r --ldd 784 --stride_d 200704 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 192 -k 32 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 39200 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 235200 --d_type f32_r --ldd 1225 --stride_d 235200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 192 -k 48 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 58800 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 235200 --d_type f32_r --ldd 1225 --stride_d 235200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 78400 --b_type f32_r --ldb 192 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 235200 --d_type f32_r --ldd 1225 --stride_d 235200 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 256 -k 48 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 58800 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 313600 --d_type f32_r --ldd 1225 --stride_d 313600 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 78400 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 313600 --d_type f32_r --ldd 1225 --stride_d 313600 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 288 -k 48 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 58800 --b_type f32_r --ldb 288 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 352800 --d_type f32_r --ldd 1225 --stride_d 352800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 288 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 78400 --b_type f32_r --ldb 288 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 352800 --d_type f32_r --ldd 1225 --stride_d 352800 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 768 -k 128 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 36992 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 221952 --d_type f32_r --ldd 289 --stride_d 221952 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 768 -k 160 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 46240 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 221952 --d_type f32_r --ldd 289 --stride_d 221952 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 768 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 55488 --b_type f32_r --ldb 768 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 221952 --d_type f32_r --ldd 289 --stride_d 221952 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 5329 -n 64 -k 80 --alpha 1.0 --a_type f32_r --lda 5329 --stride_a 426320 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 5329 --stride_c 341056 --d_type f32_r --ldd 5329 --stride_d 341056 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1280 -k 192 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 12288 --b_type f32_r --ldb 1280 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 81920 --d_type f32_r --ldd 64 --stride_d 81920 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1280 -k 320 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 20480 --b_type f32_r --ldb 1280 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 81920 --d_type f32_r --ldd 64 --stride_d 81920 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1280 -k 384 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 24576 --b_type f32_r --ldb 1280 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 81920 --d_type f32_r --ldd 64 --stride_d 81920 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1280 -k 448 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 28672 --b_type f32_r --ldb 1280 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 81920 --d_type f32_r --ldd 64 --stride_d 81920 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 2048 -k 192 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 12288 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 131072 --d_type f32_r --ldd 64 --stride_d 131072 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 2048 -k 320 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 20480 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 131072 --d_type f32_r --ldd 64 --stride_d 131072 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 2048 -k 384 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 24576 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 131072 --d_type f32_r --ldd 64 --stride_d 131072 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 2048 -k 448 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 28672 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 131072 --d_type f32_r --ldd 64 --stride_d 131072 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 384 -k 192 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 235200 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 470400 --d_type f32_r --ldd 1225 --stride_d 470400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 78400 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 470400 --d_type f32_r --ldd 1225 --stride_d 470400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1225 -n 384 -k 96 --alpha 1.0 --a_type f32_r --lda 1225 --stride_a 117600 --b_type f32_r --ldb 384 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 1225 --stride_c 470400 --d_type f32_r --ldd 1225 --stride_d 470400 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 1024 -k 128 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 36992 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 295936 --d_type f32_r --ldd 289 --stride_d 295936 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 1024 -k 192 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 55488 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 295936 --d_type f32_r --ldd 289 --stride_d 295936 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 1024 -k 256 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 73984 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 295936 --d_type f32_r --ldd 289 --stride_d 295936 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 289 -n 1024 -k 384 --alpha 1.0 --a_type f32_r --lda 289 --stride_a 110976 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 289 --stride_c 295936 --d_type f32_r --ldd 289 --stride_d 295936 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 5329 -n 160 -k 64 --alpha 1.0 --a_type f32_r --lda 5329 --stride_a 341056 --b_type f32_r --ldb 160 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 5329 --stride_c 852640 --d_type f32_r --ldd 5329 --stride_d 852640 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1536 -k 256 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 16384 --b_type f32_r --ldb 1536 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 98304 --d_type f32_r --ldd 64 --stride_d 98304 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 64 -n 1536 -k 384 --alpha 1.0 --a_type f32_r --lda 64 --stride_a 24576 --b_type f32_r --ldb 1536 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 98304 --d_type f32_r --ldd 64 --stride_d 98304 --batch 32 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 1024 -k 256 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 50176 --b_type f32_r --ldb 1024 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 200704 --d_type f32_r --ldd 196 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 196 -n 256 -k 1024 --alpha 1.0 --a_type f32_r --lda 196 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 196 --stride_c 50176 --d_type f32_r --ldd 196 --stride_d 50176 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 3136 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 256 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 802816 --d_type f32_r --ldd 3136 --stride_d 802816 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 3136 -n 64 -k 256 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 802816 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 3136 -n 64 -k 64 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 200704 --b_type f32_r --ldb 64 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 3136 --stride_c 200704 --d_type f32_r --ldd 3136 --stride_d 200704 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 2048 -k 512 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 25088 --b_type f32_r --ldb 2048 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 100352 --d_type f32_r --ldd 49 --stride_d 100352 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 49 -n 512 -k 2048 --alpha 1.0 --a_type f32_r --lda 49 --stride_a 100352 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 49 --stride_c 25088 --d_type f32_r --ldd 49 --stride_d 25088 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 128 -k 512 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 401408 --b_type f32_r --ldb 128 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 100352 --d_type f32_r --ldd 784 --stride_d 100352 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 784 -n 512 -k 128 --alpha 1.0 --a_type f32_r --lda 784 --stride_a 100352 --b_type f32_r --ldb 512 --stride_b 0 --beta 0.0 --c_type f32_r --ldc 784 --stride_c 401408 --d_type f32_r --ldd 784 --stride_d 401408 --batch 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 128 -n 96 -k 1568 --alpha 1.0 --a_type f32_r --lda 1568 --stride_a 200704 --b_type f32_r --ldb 1568 --stride_b 150528 --beta 0.0 --c_type f32_r --ldc 128 --stride_c 12288 --d_type f32_r --ldd 128 --stride_d 12288 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 192 -n 128 -k 1568 --alpha 1.0 --a_type f32_r --lda 1568 --stride_a 301056 --b_type f32_r --ldb 1568 --stride_b 200704 --beta 0.0 --c_type f32_r --ldc 192 --stride_c 24576 --d_type f32_r --ldd 192 --stride_d 24576 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 192 -n 64 -k 6272 --alpha 1.0 --a_type f32_r --lda 6272 --stride_a 1204224 --b_type f32_r --ldb 6272 --stride_b 401408 --beta 0.0 --c_type f32_r --ldc 192 --stride_c 12288 --d_type f32_r --ldd 192 --stride_d 12288 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 192 -n 80 -k 10368 --alpha 1.0 --a_type f32_r --lda 10368 --stride_a 1990656 --b_type f32_r --ldb 10368 --stride_b 829440 --beta 0.0 --c_type f32_r --ldc 192 --stride_c 15360 --d_type f32_r --ldd 192 --stride_d 15360 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 32 -n 32 -k 43808 --alpha 1.0 --a_type f32_r --lda 43808 --stride_a 1401856 --b_type f32_r --ldb 43808 --stride_b 1401856 --beta 0.0 --c_type f32_r --ldc 32 --stride_c 1024 --d_type f32_r --ldd 32 --stride_d 1024 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 64 -n 32 -k 43808 --alpha 1.0 --a_type f32_r --lda 43808 --stride_a 2803712 --b_type f32_r --ldb 43808 --stride_b 1401856 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 2048 --d_type f32_r --ldd 64 --stride_d 2048 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 96 -n 64 -k 2592 --alpha 1.0 --a_type f32_r --lda 2592 --stride_a 248832 --b_type f32_r --ldb 2592 --stride_b 165888 --beta 0.0 --c_type f32_r --ldc 96 --stride_c 6144 --d_type f32_r --ldd 96 --stride_d 6144 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 96 -n 96 -k 2592 --alpha 1.0 --a_type f32_r --lda 2592 --stride_a 248832 --b_type f32_r --ldb 2592 --stride_b 248832 --beta 0.0 --c_type f32_r --ldc 96 --stride_c 9216 --d_type f32_r --ldd 96 --stride_d 9216 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 224 -n 192 -k 2592 --alpha 1.0 --a_type f32_r --lda 2592 --stride_a 580608 --b_type f32_r --ldb 2592 --stride_b 497664 --beta 0.0 --c_type f32_r --ldc 224 --stride_c 43008 --d_type f32_r --ldd 224 --stride_d 43008 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 96 -n 64 -k 10368 --alpha 1.0 --a_type f32_r --lda 10368 --stride_a 995328 --b_type f32_r --ldb 10368 --stride_b 663552 --beta 0.0 --c_type f32_r --ldc 96 --stride_c 6144 --d_type f32_r --ldd 96 --stride_d 6144 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 128 -n 128 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --stride_a 401408 --b_type f32_r --ldb 3136 --stride_b 401408 --beta 0.0 --c_type f32_r --ldc 128 --stride_c 16384 --d_type f32_r --ldd 128 --stride_d 16384 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 64 -n 64 -k 12544 --alpha 1.0 --a_type f32_r --lda 12544 --stride_a 802816 --b_type f32_r --ldb 12544 --stride_b 802816 --beta 0.0 --c_type f32_r --ldc 64 --stride_c 4096 --d_type f32_r --ldd 64 --stride_d 4096 --batch 36 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1600 -n 192 -k 729 --alpha 1.0 --a_type f32_r --lda 729 --b_type f32_r --ldb 729 --beta 1.0 --c_type f32_r --ldc 1600 --d_type f32_r --ldd 1600 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 384 -k 169 --alpha 1.0 --a_type f32_r --lda 169 --b_type f32_r --ldb 169 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 3456 -n 256 -k 169 --alpha 1.0 --a_type f32_r --lda 169 --b_type f32_r --ldb 169 --beta 1.0 --c_type f32_r --ldc 3456 --d_type f32_r --ldd 3456 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 3456 -n 384 -k 169 --alpha 1.0 --a_type f32_r --lda 169 --b_type f32_r --ldb 169 --beta 1.0 --c_type f32_r --ldc 3456 --d_type f32_r --ldd 3456 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 363 -n 64 -k 3025 --alpha 1.0 --a_type f32_r --lda 3025 --b_type f32_r --ldb 3025 --beta 1.0 --c_type f32_r --ldc 363 --d_type f32_r --ldd 363 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1008 -n 224 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1008 --d_type f32_r --ldd 1008 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 192 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1200 -n 128 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1200 --d_type f32_r --ldd 1200 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1296 -n 288 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1296 --d_type f32_r --ldd 1296 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1440 -n 320 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1440 --d_type f32_r --ldd 1440 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1440 -n 320 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1440 --d_type f32_r --ldd 1440 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 147 -n 64 -k 12544 --alpha 1.0 --a_type f32_r --lda 12544 --b_type f32_r --ldb 12544 --beta 1.0 --c_type f32_r --ldc 147 --d_type f32_r --ldd 147 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 384 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 16 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 32 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 64 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 96 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 128 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 32 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 64 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 400 -n 32 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 400 --d_type f32_r --ldd 400 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 400 -n 48 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 400 --d_type f32_r --ldd 400 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 480 -n 16 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 480 --d_type f32_r --ldd 480 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 480 -n 192 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 480 --d_type f32_r --ldd 480 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 480 -n 64 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 480 --d_type f32_r --ldd 480 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 480 -n 96 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 480 --d_type f32_r --ldd 480 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 112 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 128 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 144 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 160 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 24 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 32 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 64 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 528 -n 128 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 528 --d_type f32_r --ldd 528 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 528 -n 160 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 528 --d_type f32_r --ldd 528 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 528 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 528 --d_type f32_r --ldd 528 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 528 -n 32 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 528 --d_type f32_r --ldd 528 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 192 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 600 -n 64 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 600 --d_type f32_r --ldd 600 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 64 -n 64 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 800 -n 128 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 800 --d_type f32_r --ldd 800 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 800 -n 128 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 800 --d_type f32_r --ldd 800 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 800 -n 64 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 800 --d_type f32_r --ldd 800 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 800 -n 96 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 800 --d_type f32_r --ldd 800 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 128 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 160 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 192 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 256 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 32 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 384 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 832 -n 48 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 832 --d_type f32_r --ldd 832 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 864 -n 128 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 864 --d_type f32_r --ldd 864 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 864 -n 208 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 864 --d_type f32_r --ldd 864 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1120 -n 160 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1120 --d_type f32_r --ldd 1120 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1120 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1120 --d_type f32_r --ldd 1120 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1200 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 1200 --d_type f32_r --ldd 1200 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1280 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1280 --d_type f32_r --ldd 1280 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1280 -n 320 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1280 --d_type f32_r --ldd 1280 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1280 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1280 --d_type f32_r --ldd 1280 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1280 -n 448 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1280 --d_type f32_r --ldd 1280 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1344 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1344 --d_type f32_r --ldd 1344 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 320 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 32 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 48 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 192 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 192 --d_type f32_r --ldd 192 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 192 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 320 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 448 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 48 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2592 -n 384 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 2592 --d_type f32_r --ldd 2592 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 27 -n 32 -k 22201 --alpha 1.0 --a_type f32_r --lda 22201 --b_type f32_r --ldb 22201 --beta 1.0 --c_type f32_r --ldc 27 --d_type f32_r --ldd 27 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 288 -n 32 -k 21609 --alpha 1.0 --a_type f32_r --lda 21609 --b_type f32_r --ldb 21609 --beta 1.0 --c_type f32_r --ldc 288 --d_type f32_r --ldd 288 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 288 -n 48 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 288 --d_type f32_r --ldd 288 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 288 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 288 --d_type f32_r --ldd 288 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 288 -n 64 -k 21609 --alpha 1.0 --a_type f32_r --lda 21609 --b_type f32_r --ldb 21609 --beta 1.0 --c_type f32_r --ldc 288 --d_type f32_r --ldd 288 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 4032 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 4032 --d_type f32_r --ldd 4032 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 96 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 64 -n 80 -k 5329 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 5329 --beta 1.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 720 -n 192 -k 5041 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 5041 --beta 1.0 --c_type f32_r --ldc 720 --d_type f32_r --ldd 720 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 768 -n 128 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 768 --d_type f32_r --ldd 768 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 768 -n 160 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 768 --d_type f32_r --ldd 768 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 768 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 768 --d_type f32_r --ldd 768 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 864 -n 96 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 864 --d_type f32_r --ldd 864 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 864 -n 96 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 864 --d_type f32_r --ldd 864 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 896 -n 128 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 896 --d_type f32_r --ldd 896 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 896 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 896 --d_type f32_r --ldd 896 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 128 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 192 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 256 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 384 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 448 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1344 -n 224 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1344 --d_type f32_r --ldd 1344 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1344 -n 512 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1344 --d_type f32_r --ldd 1344 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1536 -n 256 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1536 --d_type f32_r --ldd 1536 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1536 -n 384 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 1536 --d_type f32_r --ldd 1536 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1568 -n 224 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1568 -n 256 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1568 --d_type f32_r --ldd 1568 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 160 -n 64 -k 5329 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 5329 --beta 1.0 --c_type f32_r --ldc 160 --d_type f32_r --ldd 160 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 192 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1728 -n 224 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 1728 --d_type f32_r --ldd 1728 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1792 -n 256 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1792 --d_type f32_r --ldd 1792 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1792 -n 320 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 1792 --d_type f32_r --ldd 1792 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2016 -n 256 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 2016 --d_type f32_r --ldd 2016 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2880 -n 320 -k 64 --alpha 1.0 --a_type f32_r --lda 64 --b_type f32_r --ldb 64 --beta 1.0 --c_type f32_r --ldc 2880 --d_type f32_r --ldd 2880 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 3456 -n 384 -k 289 --alpha 1.0 --a_type f32_r --lda 289 --b_type f32_r --ldb 289 --beta 1.0 --c_type f32_r --ldc 3456 --d_type f32_r --ldd 3456 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 384 -n 192 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 384 --d_type f32_r --ldd 384 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 384 -n 64 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 384 --d_type f32_r --ldd 384 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 384 -n 96 -k 1225 --alpha 1.0 --a_type f32_r --lda 1225 --b_type f32_r --ldb 1225 --beta 1.0 --c_type f32_r --ldc 384 --d_type f32_r --ldd 384 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 448 -n 64 -k 5329 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 5329 --beta 1.0 --c_type f32_r --ldc 448 --d_type f32_r --ldd 448 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 96 -k 5041 --alpha 1.0 --a_type f32_r --lda 5041 --b_type f32_r --ldb 5041 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 96 -k 5329 --alpha 1.0 --a_type f32_r --lda 5329 --b_type f32_r --ldb 5329 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 2048 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 512 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1152 -n 128 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 1152 --d_type f32_r --ldd 1152 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 128 -n 512 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 128 --d_type f32_r --ldd 128 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2048 -n 512 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 2048 --d_type f32_r --ldd 2048 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 2304 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 2304 --d_type f32_r --ldd 2304 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 1024 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 512 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 256 -n 64 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 4608 -n 512 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 4608 --d_type f32_r --ldd 4608 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 1024 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 128 -k 784 --alpha 1.0 --a_type f32_r --lda 784 --b_type f32_r --ldb 784 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 2048 -k 49 --alpha 1.0 --a_type f32_r --lda 49 --b_type f32_r --ldb 49 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 512 -n 256 -k 196 --alpha 1.0 --a_type f32_r --lda 196 --b_type f32_r --ldb 196 --beta 1.0 --c_type f32_r --ldc 512 --d_type f32_r --ldd 512 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 576 -n 64 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 576 --d_type f32_r --ldd 576 --compute_type f32_r --algo 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 64 -n 256 -k 3136 --alpha 1.0 --a_type f32_r --lda 3136 --b_type f32_r --ldb 3136 --beta 1.0 --c_type f32_r --ldc 64 --d_type f32_r --ldd 64 --compute_type f32_r --algo 0